X86ISelLowering.cpp revision 944061c4e152e9f66ffaaca5905253ba8012a4fa
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "x86-isel"
16#include "X86ISelLowering.h"
17#include "Utils/X86ShuffleDecode.h"
18#include "X86.h"
19#include "X86InstrBuilder.h"
20#include "X86TargetMachine.h"
21#include "X86TargetObjectFile.h"
22#include "llvm/ADT/SmallSet.h"
23#include "llvm/ADT/Statistic.h"
24#include "llvm/ADT/StringExtras.h"
25#include "llvm/ADT/VariadicFunction.h"
26#include "llvm/CodeGen/IntrinsicLowering.h"
27#include "llvm/CodeGen/MachineFrameInfo.h"
28#include "llvm/CodeGen/MachineFunction.h"
29#include "llvm/CodeGen/MachineInstrBuilder.h"
30#include "llvm/CodeGen/MachineJumpTableInfo.h"
31#include "llvm/CodeGen/MachineModuleInfo.h"
32#include "llvm/CodeGen/MachineRegisterInfo.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/Constants.h"
35#include "llvm/IR/DerivedTypes.h"
36#include "llvm/IR/Function.h"
37#include "llvm/IR/GlobalAlias.h"
38#include "llvm/IR/GlobalVariable.h"
39#include "llvm/IR/Instructions.h"
40#include "llvm/IR/Intrinsics.h"
41#include "llvm/IR/LLVMContext.h"
42#include "llvm/MC/MCAsmInfo.h"
43#include "llvm/MC/MCContext.h"
44#include "llvm/MC/MCExpr.h"
45#include "llvm/MC/MCSymbol.h"
46#include "llvm/Support/CallSite.h"
47#include "llvm/Support/Debug.h"
48#include "llvm/Support/ErrorHandling.h"
49#include "llvm/Support/MathExtras.h"
50#include "llvm/Target/TargetOptions.h"
51#include <bitset>
52#include <cctype>
53using namespace llvm;
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57// Forward declarations.
58static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
59                       SDValue V2);
60
61/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
62/// sets things up to match to an AVX VEXTRACTF128 instruction or a
63/// simple subregister reference.  Idx is an index in the 128 bits we
64/// want.  It need not be aligned to a 128-bit boundary.  That makes
65/// lowering EXTRACT_VECTOR_ELT operations easier.
66static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
67                                   SelectionDAG &DAG, SDLoc dl) {
68  EVT VT = Vec.getValueType();
69  assert(VT.is256BitVector() && "Unexpected vector size!");
70  EVT ElVT = VT.getVectorElementType();
71  unsigned Factor = VT.getSizeInBits()/128;
72  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
73                                  VT.getVectorNumElements()/Factor);
74
75  // Extract from UNDEF is UNDEF.
76  if (Vec.getOpcode() == ISD::UNDEF)
77    return DAG.getUNDEF(ResultVT);
78
79  // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
80  // we can match to VEXTRACTF128.
81  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
82
83  // This is the index of the first element of the 128-bit chunk
84  // we want.
85  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
86                               * ElemsPerChunk);
87
88  // If the input is a buildvector, just emit a smaller one.
89  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
90    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
91                       Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
92
93  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
94  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
95                               VecIdx);
96
97  return Result;
98}
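
// Worked example (illustrative): for a v8i32 source with IdxVal == 5, ElVT is
// i32, so ElemsPerChunk == 128/32 == 4 and NormalizedIdxVal == ((5*32)/128)*4
// == 4.  The extracted v4i32 therefore holds elements [4..7] (the upper
// 128-bit half), and the element requested at index 5 sits at offset 1 within
// it.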
99
100/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
101/// sets things up to match to an AVX VINSERTF128 instruction or a
102/// simple superregister reference.  Idx is an index in the 128 bits
103/// we want.  It need not be aligned to a 128-bit boundary.  That makes
104/// lowering INSERT_VECTOR_ELT operations easier.
105static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
106                                  unsigned IdxVal, SelectionDAG &DAG,
107                                  SDLoc dl) {
108  // Inserting UNDEF into Result leaves Result unchanged.
109  if (Vec.getOpcode() == ISD::UNDEF)
110    return Result;
111
112  EVT VT = Vec.getValueType();
113  assert(VT.is128BitVector() && "Unexpected vector size!");
114
115  EVT ElVT = VT.getVectorElementType();
116  EVT ResultVT = Result.getValueType();
117
118  // Insert the relevant 128 bits.
119  unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
120
121  // This is the index of the first element of the 128-bit chunk
122  // we want.
123  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
124                               * ElemsPerChunk);
125
126  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
127  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
128                     VecIdx);
129}
130
131/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
132/// instructions. This is used because creating CONCAT_VECTOR nodes of
133/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
134/// large BUILD_VECTORS.
135static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
136                                   unsigned NumElems, SelectionDAG &DAG,
137                                   SDLoc dl) {
138  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
139  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
140}
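
// For instance, concatenating two v4i32 halves V1 and V2 into a v8i32:
//   SDValue Wide = Concat128BitVectors(V1, V2, MVT::v8i32, 8, DAG, dl);
// produces two INSERT_SUBVECTOR nodes (at element indices 0 and 4) that the
// selector can match as VINSERTF128 insertions into an undef 256-bit register.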
141
142static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
143  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
144  bool is64Bit = Subtarget->is64Bit();
145
146  if (Subtarget->isTargetEnvMacho()) {
147    if (is64Bit)
148      return new X86_64MachoTargetObjectFile();
149    return new TargetLoweringObjectFileMachO();
150  }
151
152  if (Subtarget->isTargetLinux())
153    return new X86LinuxTargetObjectFile();
154  if (Subtarget->isTargetELF())
155    return new TargetLoweringObjectFileELF();
156  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
157    return new TargetLoweringObjectFileCOFF();
158  llvm_unreachable("unknown subtarget type");
159}
160
161X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
162  : TargetLowering(TM, createTLOF(TM)) {
163  Subtarget = &TM.getSubtarget<X86Subtarget>();
164  X86ScalarSSEf64 = Subtarget->hasSSE2();
165  X86ScalarSSEf32 = Subtarget->hasSSE1();
166  TD = getDataLayout();
167
168  resetOperationActions();
169}
170
171void X86TargetLowering::resetOperationActions() {
172  const TargetMachine &TM = getTargetMachine();
173  static bool FirstTimeThrough = true;
174
175  // If none of the target options have changed, then we don't need to reset the
176  // operation actions.
177  if (!FirstTimeThrough && TO == TM.Options) return;
178
179  if (!FirstTimeThrough) {
180    // Reinitialize the actions.
181    initActions();
182    FirstTimeThrough = false;
183  }
184
185  TO = TM.Options;
186
187  // Set up the TargetLowering object.
188  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
189
190  // X86 is weird; it always uses i8 for shift amounts and setcc results.
191  setBooleanContents(ZeroOrOneBooleanContent);
192  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
193  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
194
195  // For 64-bit, since we have so many registers, use the ILP scheduler; for
196  // 32-bit code, use the register-pressure-specific scheduling.
197  // For Atom, always use ILP scheduling.
198  if (Subtarget->isAtom())
199    setSchedulingPreference(Sched::ILP);
200  else if (Subtarget->is64Bit())
201    setSchedulingPreference(Sched::ILP);
202  else
203    setSchedulingPreference(Sched::RegPressure);
204  const X86RegisterInfo *RegInfo =
205    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
206  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
207
208  // Bypass expensive divides on Atom when compiling with O2
209  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
210    addBypassSlowDiv(32, 8);
211    if (Subtarget->is64Bit())
212      addBypassSlowDiv(64, 16);
213  }
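  // The bypass above means a 32-bit divide is emitted with a run-time check:
  // when both operands fit in 8 bits the much cheaper 8-bit divide is used,
  // roughly "if (((a | b) >> 8) == 0) use div8 else use div32", with the
  // analogous 64-bit/16-bit split on 64-bit targets.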
214
215  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
216    // Setup Windows compiler runtime calls.
217    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
218    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
219    setLibcallName(RTLIB::SREM_I64, "_allrem");
220    setLibcallName(RTLIB::UREM_I64, "_aullrem");
221    setLibcallName(RTLIB::MUL_I64, "_allmul");
222    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
223    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
224    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
225    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
226    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
227
228    // The _ftol2 runtime function has an unusual calling conv, which
229    // is modeled by a special pseudo-instruction.
230    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
231    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
232    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
233    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
234  }
235
236  if (Subtarget->isTargetDarwin()) {
237    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
238    setUseUnderscoreSetJmp(false);
239    setUseUnderscoreLongJmp(false);
240  } else if (Subtarget->isTargetMingw()) {
241    // MS runtime is weird: setjmp is exported as _setjmp, but longjmp keeps
242    // its plain name!
242    setUseUnderscoreSetJmp(true);
243    setUseUnderscoreLongJmp(false);
244  } else {
245    setUseUnderscoreSetJmp(true);
246    setUseUnderscoreLongJmp(true);
247  }
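  // Net effect of the flags above: the lowered setjmp/longjmp calls use
  // "setjmp"/"longjmp" on Darwin, "_setjmp"/"longjmp" on MinGW, and
  // "_setjmp"/"_longjmp" everywhere else.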
248
249  // Set up the register classes.
250  addRegisterClass(MVT::i8, &X86::GR8RegClass);
251  addRegisterClass(MVT::i16, &X86::GR16RegClass);
252  addRegisterClass(MVT::i32, &X86::GR32RegClass);
253  if (Subtarget->is64Bit())
254    addRegisterClass(MVT::i64, &X86::GR64RegClass);
255
256  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
257
258  // We don't accept any truncstore of integer registers.
259  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
260  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
261  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
262  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
263  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
264  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
265
266  // SETOEQ and SETUNE require checking two conditions.
267  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
268  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
269  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
270  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
271  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
272  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
273
274  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
275  // operation.
276  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
277  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
278  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
279
280  if (Subtarget->is64Bit()) {
281    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
282    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
283  } else if (!TM.Options.UseSoftFloat) {
284    // We have an algorithm for SSE2->double, and we turn this into a
285    // 64-bit FILD followed by conditional FADD for other targets.
286    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
287    // We have an algorithm for SSE2, and we turn this into a 64-bit
288    // FILD for other targets.
289    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
290  }
291
292  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
293  // this operation.
294  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
295  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
296
297  if (!TM.Options.UseSoftFloat) {
298    // SSE has no i16 to fp conversion, only i32
299    if (X86ScalarSSEf32) {
300      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
301      // f32 and f64 cases are Legal, f80 case is not
302      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
303    } else {
304      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
305      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
306    }
307  } else {
308    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
309    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
310  }
311
312  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
313  // are Legal, f80 is custom lowered.
314  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
315  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
316
317  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
318  // this operation.
319  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
320  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
321
322  if (X86ScalarSSEf32) {
323    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
324    // f32 and f64 cases are Legal, f80 case is not
325    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
326  } else {
327    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
328    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
329  }
330
331  // Handle FP_TO_UINT by promoting the destination to a larger signed
332  // conversion.
333  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
334  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
335  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
336
337  if (Subtarget->is64Bit()) {
338    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
339    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
340  } else if (!TM.Options.UseSoftFloat) {
341    // Since AVX is a superset of SSE3, only check for SSE here.
342    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
343      // Expand FP_TO_UINT into a select.
344      // FIXME: We would like to use a Custom expander here eventually to do
345      // the optimal thing for SSE vs. the default expansion in the legalizer.
346      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
347    else
348      // With SSE3 we can use fisttpll to convert to a signed i64; without
349      // SSE, we're stuck with a fistpll.
350      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
351  }
352
353  if (isTargetFTOL()) {
354    // Use the _ftol2 runtime function, which has a pseudo-instruction
355    // to handle its weird calling convention.
356    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
357  }
358
359  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
360  if (!X86ScalarSSEf64) {
361    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
362    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
363    if (Subtarget->is64Bit()) {
364      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
365      // Without SSE, i64->f64 goes through memory.
366      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
367    }
368  }
369
370  // Scalar integer divide and remainder are lowered to use operations that
371  // produce two results, to match the available instructions. This exposes
372  // the two-result form to trivial CSE, which is able to combine x/y and x%y
373  // into a single instruction.
374  //
375  // Scalar integer multiply-high is also lowered to use two-result
376  // operations, to match the available instructions. However, plain multiply
377  // (low) operations are left as Legal, as there are single-result
378  // instructions for this in x86. Using the two-result multiply instructions
379  // when both high and low results are needed must be arranged by dagcombine.
380  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
381    MVT VT = IntVTs[i];
382    setOperationAction(ISD::MULHS, VT, Expand);
383    setOperationAction(ISD::MULHU, VT, Expand);
384    setOperationAction(ISD::SDIV, VT, Expand);
385    setOperationAction(ISD::UDIV, VT, Expand);
386    setOperationAction(ISD::SREM, VT, Expand);
387    setOperationAction(ISD::UREM, VT, Expand);
388
389    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
390    setOperationAction(ISD::ADDC, VT, Custom);
391    setOperationAction(ISD::ADDE, VT, Custom);
392    setOperationAction(ISD::SUBC, VT, Custom);
393    setOperationAction(ISD::SUBE, VT, Custom);
394  }
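  // With SDIV/SREM (and UDIV/UREM) expanded as above, code such as
  //   "q = a / b; r = a % b;"
  // is legalized into a single ISD::SDIVREM node once the two uses are
  // combined, which then matches one IDIV producing the quotient in EAX and
  // the remainder in EDX.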
395
396  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
397  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
398  setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
399  setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
400  setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
401  setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
402  setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
403  setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
404  setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
405  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
406  if (Subtarget->is64Bit())
407    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
408  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
409  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
410  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
411  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
412  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
413  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
414  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
415  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
416
417  // Promote the i8 variants and force them up to i32, which has a shorter
418  // encoding.
419  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
420  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
421  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
422  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
423  if (Subtarget->hasBMI()) {
424    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
425    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
426    if (Subtarget->is64Bit())
427      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
428  } else {
429    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
430    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
431    if (Subtarget->is64Bit())
432      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
433  }
434
435  if (Subtarget->hasLZCNT()) {
436    // When promoting the i8 variants, force them to i32 for a shorter
437    // encoding.
438    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
439    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
440    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
441    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
442    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
443    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
444    if (Subtarget->is64Bit())
445      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
446  } else {
447    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
448    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
449    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
450    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
451    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
452    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
453    if (Subtarget->is64Bit()) {
454      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
455      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
456    }
457  }
458
459  if (Subtarget->hasPOPCNT()) {
460    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
461  } else {
462    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
463    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
464    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
465    if (Subtarget->is64Bit())
466      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
467  }
468
469  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
470  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
471
472  // These should be promoted to a larger select which is supported.
473  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
474  // X86 wants to expand cmov itself.
475  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
476  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
477  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
478  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
479  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
480  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
481  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
482  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
483  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
484  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
485  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
486  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
487  if (Subtarget->is64Bit()) {
488    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
489    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
490  }
491  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
492  // NOTE: The EH_SJLJ_SETJMP/_LONGJMP support here is NOT intended to implement
493  // SjLj exception handling, but rather a light-weight setjmp/longjmp
494  // replacement used for continuation, user-level threading, etc.  As a
495  // result, no other SjLj exception interfaces are implemented; please don't
496  // build your own exception handling based on them.
497  // LLVM/Clang supports zero-cost DWARF exception handling.
498  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
499  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
500
501  // Darwin ABI issue.
502  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
503  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
504  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
505  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
506  if (Subtarget->is64Bit())
507    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
508  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
509  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
510  if (Subtarget->is64Bit()) {
511    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
512    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
513    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
514    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
515    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
516  }
517  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
518  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
519  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
520  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
521  if (Subtarget->is64Bit()) {
522    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
523    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
524    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
525  }
526
527  if (Subtarget->hasSSE1())
528    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
529
530  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
531
532  // Expand certain atomics
533  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
534    MVT VT = IntVTs[i];
535    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
536    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
537    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
538  }
539
540  if (!Subtarget->is64Bit()) {
541    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
542    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
543    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
544    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
545    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
546    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
547    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
548    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
549    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
550    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
551    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
552    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
553  }
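  // On 32-bit targets there is no plain 64-bit atomic instruction, so the
  // custom lowering above implements these i64 operations with LOCK CMPXCHG8B
  // compare-and-swap loops.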
554
555  if (Subtarget->hasCmpxchg16b()) {
556    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
557  }
558
559  // FIXME - use subtarget debug flags
560  if (!Subtarget->isTargetDarwin() &&
561      !Subtarget->isTargetELF() &&
562      !Subtarget->isTargetCygMing()) {
563    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
564  }
565
566  if (Subtarget->is64Bit()) {
567    setExceptionPointerRegister(X86::RAX);
568    setExceptionSelectorRegister(X86::RDX);
569  } else {
570    setExceptionPointerRegister(X86::EAX);
571    setExceptionSelectorRegister(X86::EDX);
572  }
573  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
574  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
575
576  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
577  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
578
579  setOperationAction(ISD::TRAP, MVT::Other, Legal);
580  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
581
582  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
583  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
584  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
585  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
586    // TargetInfo::X86_64ABIBuiltinVaList
587    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
588    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
589  } else {
590    // TargetInfo::CharPtrBuiltinVaList
591    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
592    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
593  }
594
595  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
596  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
597
598  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
599    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
600                       MVT::i64 : MVT::i32, Custom);
601  else if (TM.Options.EnableSegmentedStacks)
602    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
603                       MVT::i64 : MVT::i32, Custom);
604  else
605    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
606                       MVT::i64 : MVT::i32, Expand);
607
608  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
609    // f32 and f64 use SSE.
610    // Set up the FP register classes.
611    addRegisterClass(MVT::f32, &X86::FR32RegClass);
612    addRegisterClass(MVT::f64, &X86::FR64RegClass);
613
614    // Use ANDPD to simulate FABS.
615    setOperationAction(ISD::FABS , MVT::f64, Custom);
616    setOperationAction(ISD::FABS , MVT::f32, Custom);
617
618    // Use XORP to simulate FNEG.
619    setOperationAction(ISD::FNEG , MVT::f64, Custom);
620    setOperationAction(ISD::FNEG , MVT::f32, Custom);
621
622    // Use ANDPD and ORPD to simulate FCOPYSIGN.
623    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
624    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
625
626    // Lower this to FGETSIGNx86 plus an AND.
627    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
628    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
629
630    // We don't support sin/cos/fmod
631    setOperationAction(ISD::FSIN   , MVT::f64, Expand);
632    setOperationAction(ISD::FCOS   , MVT::f64, Expand);
633    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
634    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
635    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
636    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
637
638    // Expand FP immediates into loads from the stack, except for the special
639    // cases we handle.
640    addLegalFPImmediate(APFloat(+0.0)); // xorpd
641    addLegalFPImmediate(APFloat(+0.0f)); // xorps
642  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
643    // Use SSE for f32, x87 for f64.
644    // Set up the FP register classes.
645    addRegisterClass(MVT::f32, &X86::FR32RegClass);
646    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
647
648    // Use ANDPS to simulate FABS.
649    setOperationAction(ISD::FABS , MVT::f32, Custom);
650
651    // Use XORP to simulate FNEG.
652    setOperationAction(ISD::FNEG , MVT::f32, Custom);
653
654    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
655
656    // Use ANDPS and ORPS to simulate FCOPYSIGN.
657    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
658    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
659
660    // We don't support sin/cos/fmod
661    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
662    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
663    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
664
665    // Special cases we handle for FP constants.
666    addLegalFPImmediate(APFloat(+0.0f)); // xorps
667    addLegalFPImmediate(APFloat(+0.0)); // FLD0
668    addLegalFPImmediate(APFloat(+1.0)); // FLD1
669    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
670    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
671
672    if (!TM.Options.UnsafeFPMath) {
673      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
674      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
675      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
676    }
677  } else if (!TM.Options.UseSoftFloat) {
678    // f32 and f64 in x87.
679    // Set up the FP register classes.
680    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
681    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
682
683    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
684    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
685    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
686    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
687
688    if (!TM.Options.UnsafeFPMath) {
689      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
690      setOperationAction(ISD::FSIN   , MVT::f32, Expand);
691      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
692      setOperationAction(ISD::FCOS   , MVT::f32, Expand);
693      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
694      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
695    }
696    addLegalFPImmediate(APFloat(+0.0)); // FLD0
697    addLegalFPImmediate(APFloat(+1.0)); // FLD1
698    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
699    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
700    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
701    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
702    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
703    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
704  }
705
706  // We don't support FMA.
707  setOperationAction(ISD::FMA, MVT::f64, Expand);
708  setOperationAction(ISD::FMA, MVT::f32, Expand);
709
710  // Long double always uses X87.
711  if (!TM.Options.UseSoftFloat) {
712    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
713    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
714    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
715    {
716      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
717      addLegalFPImmediate(TmpFlt);  // FLD0
718      TmpFlt.changeSign();
719      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
720
721      bool ignored;
722      APFloat TmpFlt2(+1.0);
723      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
724                      &ignored);
725      addLegalFPImmediate(TmpFlt2);  // FLD1
726      TmpFlt2.changeSign();
727      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
728    }
729
730    if (!TM.Options.UnsafeFPMath) {
731      setOperationAction(ISD::FSIN   , MVT::f80, Expand);
732      setOperationAction(ISD::FCOS   , MVT::f80, Expand);
733      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
734    }
735
736    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
737    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
738    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
739    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
740    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
741    setOperationAction(ISD::FMA, MVT::f80, Expand);
742  }
743
744  // Always use a library call for pow.
745  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
746  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
747  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
748
749  setOperationAction(ISD::FLOG, MVT::f80, Expand);
750  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
751  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
752  setOperationAction(ISD::FEXP, MVT::f80, Expand);
753  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
754
755  // First set operation action for all vector types to either promote
756  // (for widening) or expand (for scalarization). Then we will selectively
757  // turn on ones that can be effectively codegen'd.
758  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
759           i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
760    MVT VT = (MVT::SimpleValueType)i;
761    setOperationAction(ISD::ADD , VT, Expand);
762    setOperationAction(ISD::SUB , VT, Expand);
763    setOperationAction(ISD::FADD, VT, Expand);
764    setOperationAction(ISD::FNEG, VT, Expand);
765    setOperationAction(ISD::FSUB, VT, Expand);
766    setOperationAction(ISD::MUL , VT, Expand);
767    setOperationAction(ISD::FMUL, VT, Expand);
768    setOperationAction(ISD::SDIV, VT, Expand);
769    setOperationAction(ISD::UDIV, VT, Expand);
770    setOperationAction(ISD::FDIV, VT, Expand);
771    setOperationAction(ISD::SREM, VT, Expand);
772    setOperationAction(ISD::UREM, VT, Expand);
773    setOperationAction(ISD::LOAD, VT, Expand);
774    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
775    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
776    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
777    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
778    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
779    setOperationAction(ISD::FABS, VT, Expand);
780    setOperationAction(ISD::FSIN, VT, Expand);
781    setOperationAction(ISD::FSINCOS, VT, Expand);
782    setOperationAction(ISD::FCOS, VT, Expand);
783    setOperationAction(ISD::FSINCOS, VT, Expand);
784    setOperationAction(ISD::FREM, VT, Expand);
785    setOperationAction(ISD::FMA,  VT, Expand);
786    setOperationAction(ISD::FPOWI, VT, Expand);
787    setOperationAction(ISD::FSQRT, VT, Expand);
788    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
789    setOperationAction(ISD::FFLOOR, VT, Expand);
790    setOperationAction(ISD::FCEIL, VT, Expand);
791    setOperationAction(ISD::FTRUNC, VT, Expand);
792    setOperationAction(ISD::FRINT, VT, Expand);
793    setOperationAction(ISD::FNEARBYINT, VT, Expand);
794    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
795    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
796    setOperationAction(ISD::SDIVREM, VT, Expand);
797    setOperationAction(ISD::UDIVREM, VT, Expand);
798    setOperationAction(ISD::FPOW, VT, Expand);
799    setOperationAction(ISD::CTPOP, VT, Expand);
800    setOperationAction(ISD::CTTZ, VT, Expand);
801    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
802    setOperationAction(ISD::CTLZ, VT, Expand);
803    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
804    setOperationAction(ISD::SHL, VT, Expand);
805    setOperationAction(ISD::SRA, VT, Expand);
806    setOperationAction(ISD::SRL, VT, Expand);
807    setOperationAction(ISD::ROTL, VT, Expand);
808    setOperationAction(ISD::ROTR, VT, Expand);
809    setOperationAction(ISD::BSWAP, VT, Expand);
810    setOperationAction(ISD::SETCC, VT, Expand);
811    setOperationAction(ISD::FLOG, VT, Expand);
812    setOperationAction(ISD::FLOG2, VT, Expand);
813    setOperationAction(ISD::FLOG10, VT, Expand);
814    setOperationAction(ISD::FEXP, VT, Expand);
815    setOperationAction(ISD::FEXP2, VT, Expand);
816    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
817    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
818    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
819    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
820    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
821    setOperationAction(ISD::TRUNCATE, VT, Expand);
822    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
823    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
824    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
825    setOperationAction(ISD::VSELECT, VT, Expand);
826    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
827             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
828      setTruncStoreAction(VT,
829                          (MVT::SimpleValueType)InnerVT, Expand);
830    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
831    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
832    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
833  }
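  // These Expand defaults are only a starting point; the feature-gated blocks
  // below selectively flip operations back on.  For example, ISD::ADD on
  // MVT::v4i32 starts out Expand here and is re-marked Legal once the SSE2
  // block registers the 128-bit integer vector types.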
834
835  // FIXME: In order to prevent SSE instructions from being expanded to MMX ones
836  // with -msoft-float, disable use of MMX as well.
837  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
838    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
839    // No operations on x86mmx supported, everything uses intrinsics.
840  }
841
842  // MMX-sized vectors (other than x86mmx) are expected to be expanded
843  // into smaller operations.
844  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
845  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
846  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
847  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
848  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
849  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
850  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
851  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
852  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
853  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
854  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
855  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
856  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
857  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
858  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
859  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
860  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
861  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
862  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
863  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
864  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
865  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
866  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
867  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
868  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
869  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
870  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
871  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
872  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
873
874  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
875    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
876
877    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
878    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
879    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
880    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
881    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
882    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
883    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
884    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
885    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
886    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
887    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
888    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
889  }
890
891  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
892    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
893
894    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
895    // registers cannot be used even for integer operations.
896    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
897    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
898    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
899    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
900
901    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
902    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
903    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
904    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
905    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
906    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
907    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
908    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
909    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
910    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
911    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
912    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
913    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
914    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
915    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
916    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
917    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
918    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
919
920    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
921    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
922    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
923    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
924
925    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
926    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
927    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
928    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
929    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
930
931    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
932    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
933      MVT VT = (MVT::SimpleValueType)i;
934      // Do not attempt to custom lower non-power-of-2 vectors
935      if (!isPowerOf2_32(VT.getVectorNumElements()))
936        continue;
937      // Do not attempt to custom lower non-128-bit vectors
938      if (!VT.is128BitVector())
939        continue;
940      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
941      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
942      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
943    }
944
945    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
946    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
947    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
948    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
949    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
950    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
951
952    if (Subtarget->is64Bit()) {
953      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
954      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
955    }
956
957    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
958    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
959      MVT VT = (MVT::SimpleValueType)i;
960
961      // Do not attempt to promote non-128-bit vectors
962      if (!VT.is128BitVector())
963        continue;
964
965      setOperationAction(ISD::AND,    VT, Promote);
966      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
967      setOperationAction(ISD::OR,     VT, Promote);
968      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
969      setOperationAction(ISD::XOR,    VT, Promote);
970      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
971      setOperationAction(ISD::LOAD,   VT, Promote);
972      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
973      setOperationAction(ISD::SELECT, VT, Promote);
974      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
975    }
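    // In practice a logical op on one of the promoted types is performed as a
    // single v2i64 operation: e.g. an AND of two v4i32 values is selected as
    // bitcasts to v2i64 around one v2i64 AND, which is safe because the bit
    // pattern is the same either way.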
976
977    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
978
979    // Custom lower v2i64 and v2f64 selects.
980    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
981    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
982    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
983    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
984
985    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
986    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
987
988    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
989    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
990    // As there is no 64-bit GPR available, we need to build a special custom
991    // sequence to convert from v2i32 to v2f32.
992    if (!Subtarget->is64Bit())
993      setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
994
995    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
996    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
997
998    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
999  }
1000
1001  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1002    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1003    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1004    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1005    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1006    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1007    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1008    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1009    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1010    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1011    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1012
1013    setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1014    setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1015    setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1016    setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1017    setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1018    setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1019    setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1020    setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1021    setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1022    setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1023
1024    // FIXME: Do we need to handle scalar-to-vector here?
1025    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1026
1027    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
1028    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
1029    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1030    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
1031    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
1032
1033    // i8 and i16 vectors are custom, because the source register and source
1034    // memory operand types are not the same width.  f32 vectors are
1035    // custom since the immediate controlling the insert encodes additional
1036    // information.
1037    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1038    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1039    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1040    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1041
1042    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1043    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1044    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1045    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1046
1047    // FIXME: these should be Legal, but that's only for the case where
1048    // the index is constant.  For now custom expand to deal with that.
1049    if (Subtarget->is64Bit()) {
1050      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1051      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1052    }
1053  }
1054
1055  if (Subtarget->hasSSE2()) {
1056    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1057    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1058
1059    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1060    setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1061
1062    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1063    setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1064
1065    // In the customized shift lowering, the legal cases in AVX2 will be
1066    // recognized.
1067    setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1068    setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1069
1070    setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1071    setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1072
1073    setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1074
1075    setOperationAction(ISD::SDIV,              MVT::v8i16, Custom);
1076    setOperationAction(ISD::SDIV,              MVT::v4i32, Custom);
1077  }
1078
1079  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1080    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1081    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1082    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1083    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1084    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1085    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1086
1087    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1088    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1089    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1090
1091    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1092    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1093    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1094    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1095    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1096    setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1097    setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1098    setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1099    setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1100    setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1101    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1102    setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1103
1104    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1105    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1106    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1107    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1108    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1109    setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1110    setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1111    setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1112    setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1113    setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1114    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1115    setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1116
1117    setOperationAction(ISD::TRUNCATE,           MVT::v8i16, Custom);
1118    setOperationAction(ISD::TRUNCATE,           MVT::v4i32, Custom);
1119
1120    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
1121
1122    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1123    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1124    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1125    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1126
1127    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i32, Custom);
1128    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1129    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1130
1131    setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
1132
1133    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1134    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1135
1136    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1137    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1138
1139    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1140    setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1141
1142    setOperationAction(ISD::SDIV,              MVT::v16i16, Custom);
1143
1144    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1145    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1146    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1147    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1148
1149    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1150    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1151    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1152
1153    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
1154    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
1155    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
1156    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
1157
1158    setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1159    setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1160    setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1161    setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1162    setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1163    setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1164
1165    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1166      setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1167      setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1168      setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1169      setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1170      setOperationAction(ISD::FMA,             MVT::f32, Legal);
1171      setOperationAction(ISD::FMA,             MVT::f64, Legal);
1172    }
1173
1174    if (Subtarget->hasInt256()) {
1175      setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1176      setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1177      setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1178      setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1179
1180      setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1181      setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1182      setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1183      setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1184
1185      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1186      setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1187      setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1188      // Don't lower v32i8 because there is no 128-bit byte mul
1189
1190      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1191
1192      setOperationAction(ISD::SDIV,            MVT::v8i32, Custom);
1193    } else {
1194      setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1195      setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1196      setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1197      setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1198
1199      setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1200      setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1201      setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1202      setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1203
1204      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1205      setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1206      setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1207      // Don't lower v32i8 because there is no 128-bit byte mul
1208    }
1209
1210    // In the customized shift lowering, the legal cases in AVX2 will be
1211    // recognized.
1212    setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1213    setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1214
1215    setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1216    setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1217
1218    setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1219
1220    // Custom lower several nodes for 256-bit types.
1221    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
1222             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
1223      MVT VT = (MVT::SimpleValueType)i;
1224
1225      // Extract subvector is special because the value type
1226      // (result) is 128-bit but the source is 256-bit wide.
1227      if (VT.is128BitVector())
1228        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1229
1230      // Do not attempt to custom lower other non-256-bit vectors
1231      if (!VT.is256BitVector())
1232        continue;
1233
1234      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1235      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1236      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1237      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1238      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1239      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1240      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1241    }
1242
1243    // Promote v32i8, v16i16, v8i32 select, and, or, xor, and load to v4i64.
1244    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1245      MVT VT = (MVT::SimpleValueType)i;
1246
1247      // Do not attempt to promote non-256-bit vectors
1248      if (!VT.is256BitVector())
1249        continue;
1250
1251      setOperationAction(ISD::AND,    VT, Promote);
1252      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1253      setOperationAction(ISD::OR,     VT, Promote);
1254      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1255      setOperationAction(ISD::XOR,    VT, Promote);
1256      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1257      setOperationAction(ISD::LOAD,   VT, Promote);
1258      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1259      setOperationAction(ISD::SELECT, VT, Promote);
1260      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1261    }
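    // Illustrative sketch (not part of the lowering itself): once these types
    // are promoted, a bitwise op such as (and v8i32 %a, %b) is rewritten by the
    // legalizer as a v4i64 AND wrapped in bitcasts, roughly
    //   (v8i32 (bitcast (and v4i64 (bitcast %a), (bitcast %b))))
    // so a single 256-bit logical opcode can serve every integer element type.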
1262  }
1263
1264  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1265  // of this type with custom code.
1266  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
1267           VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
1268    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
1269                       Custom);
1270  }
1271
1272  // We want to custom lower some of our intrinsics.
1273  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1274  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1275
1276  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1277  // handle type legalization for these operations here.
1278  //
1279  // FIXME: We really should do custom legalization for addition and
1280  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1281  // than generic legalization for 64-bit multiplication-with-overflow, though.
1282  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1283    // Add/Sub/Mul with overflow operations are custom lowered.
1284    MVT VT = IntVTs[i];
1285    setOperationAction(ISD::SADDO, VT, Custom);
1286    setOperationAction(ISD::UADDO, VT, Custom);
1287    setOperationAction(ISD::SSUBO, VT, Custom);
1288    setOperationAction(ISD::USUBO, VT, Custom);
1289    setOperationAction(ISD::SMULO, VT, Custom);
1290    setOperationAction(ISD::UMULO, VT, Custom);
1291  }
1292
1293  // There are no 8-bit 3-address imul/mul instructions
1294  setOperationAction(ISD::SMULO, MVT::i8, Expand);
1295  setOperationAction(ISD::UMULO, MVT::i8, Expand);
1296
1297  if (!Subtarget->is64Bit()) {
1298    // These libcalls are not available in 32-bit.
1299    setLibcallName(RTLIB::SHL_I128, 0);
1300    setLibcallName(RTLIB::SRL_I128, 0);
1301    setLibcallName(RTLIB::SRA_I128, 0);
1302  }
1303
1304  // Combine sin / cos into one node or libcall if possible.
1305  if (Subtarget->hasSinCos()) {
1306    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1307    setLibcallName(RTLIB::SINCOS_F64, "sincos");
1308    if (Subtarget->isTargetDarwin()) {
1309      // For MacOSX, we don't want the normal expansion of a libcall to
1310      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
1311      // traffic.
1312      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1313      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1314    }
1315  }
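  // Hedged illustration: the custom FSINCOS lowering above is meant to fold a
  // paired sin/cos of the same operand into one __sincos_stret call that hands
  // back both results in registers, e.g. (assuming the usual x86-64 Darwin
  // signature) something like
  //   { double, double } @__sincos_stret(double %x)
  // rather than a sincos(double, double*, double*) call that goes through memory.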
1316
1317  // We have target-specific dag combine patterns for the following nodes:
1318  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1319  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1320  setTargetDAGCombine(ISD::VSELECT);
1321  setTargetDAGCombine(ISD::SELECT);
1322  setTargetDAGCombine(ISD::SHL);
1323  setTargetDAGCombine(ISD::SRA);
1324  setTargetDAGCombine(ISD::SRL);
1325  setTargetDAGCombine(ISD::OR);
1326  setTargetDAGCombine(ISD::AND);
1327  setTargetDAGCombine(ISD::ADD);
1328  setTargetDAGCombine(ISD::FADD);
1329  setTargetDAGCombine(ISD::FSUB);
1330  setTargetDAGCombine(ISD::FMA);
1331  setTargetDAGCombine(ISD::SUB);
1332  setTargetDAGCombine(ISD::LOAD);
1333  setTargetDAGCombine(ISD::STORE);
1334  setTargetDAGCombine(ISD::ZERO_EXTEND);
1335  setTargetDAGCombine(ISD::ANY_EXTEND);
1336  setTargetDAGCombine(ISD::SIGN_EXTEND);
1337  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1338  setTargetDAGCombine(ISD::TRUNCATE);
1339  setTargetDAGCombine(ISD::SINT_TO_FP);
1340  setTargetDAGCombine(ISD::SETCC);
1341  if (Subtarget->is64Bit())
1342    setTargetDAGCombine(ISD::MUL);
1343  setTargetDAGCombine(ISD::XOR);
1344
1345  computeRegisterProperties();
1346
1347  // On Darwin, -Os means optimize for size without hurting performance, so
1348  // do not reduce the limit.
1349  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1350  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1351  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1352  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1353  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1354  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1355  setPrefLoopAlignment(4); // 2^4 bytes.
1356
1357  // Predictable cmovs don't hurt on Atom because it is in-order.
1358  PredictableSelectIsExpensive = !Subtarget->isAtom();
1359
1360  setPrefFunctionAlignment(4); // 2^4 bytes.
1361}
1362
1363EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1364  if (!VT.isVector()) return MVT::i8;
1365  return VT.changeVectorElementTypeToInteger();
1366}
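// Illustrative examples of the rule above (what the two code paths produce,
// not an exhaustive list):
//   getSetCCResultType(Ctx, MVT::f64)   -> MVT::i8    (scalar compares)
//   getSetCCResultType(Ctx, MVT::v4f32) -> MVT::v4i32 (per-element mask)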
1367
1368/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1369/// the desired ByVal argument alignment.
1370static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1371  if (MaxAlign == 16)
1372    return;
1373  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1374    if (VTy->getBitWidth() == 128)
1375      MaxAlign = 16;
1376  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1377    unsigned EltAlign = 0;
1378    getMaxByValAlign(ATy->getElementType(), EltAlign);
1379    if (EltAlign > MaxAlign)
1380      MaxAlign = EltAlign;
1381  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1382    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1383      unsigned EltAlign = 0;
1384      getMaxByValAlign(STy->getElementType(i), EltAlign);
1385      if (EltAlign > MaxAlign)
1386        MaxAlign = EltAlign;
1387      if (MaxAlign == 16)
1388        break;
1389    }
1390  }
1391}
1392
1393/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1394/// function arguments in the caller parameter area. For X86, aggregates
1395/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1396/// are at 4-byte boundaries.
1397unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1398  if (Subtarget->is64Bit()) {
1399    // Max of 8 and alignment of type.
1400    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1401    if (TyAlign > 8)
1402      return TyAlign;
1403    return 8;
1404  }
1405
1406  unsigned Align = 4;
1407  if (Subtarget->hasSSE1())
1408    getMaxByValAlign(Ty, Align);
1409  return Align;
1410}
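// A worked illustration of the rules above (hypothetical IR types; a 32-bit
// target with SSE assumed): a byval {i32, i32} lands on a 4-byte boundary,
// while a byval {<4 x float>, i32} contains a 128-bit vector, so
// getMaxByValAlign raises the alignment to 16. On x86-64 the same aggregates
// would instead get max(8, ABI alignment of the type).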
1411
1412/// getOptimalMemOpType - Returns the target specific optimal type for load
1413/// and store operations as a result of memset, memcpy, and memmove
1414 /// lowering. If DstAlign is zero, the destination can be assumed to satisfy
1415 /// any alignment constraint. Similarly, if SrcAlign is zero there is no need
1416 /// to check the source against an alignment requirement,
1417/// probably because the source does not need to be loaded. If 'IsMemset' is
1418/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1419/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1420/// source is constant so it does not need to be loaded.
1421/// It returns EVT::Other if the type should be determined using generic
1422/// target-independent logic.
1423EVT
1424X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1425                                       unsigned DstAlign, unsigned SrcAlign,
1426                                       bool IsMemset, bool ZeroMemset,
1427                                       bool MemcpyStrSrc,
1428                                       MachineFunction &MF) const {
1429  const Function *F = MF.getFunction();
1430  if ((!IsMemset || ZeroMemset) &&
1431      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1432                                       Attribute::NoImplicitFloat)) {
1433    if (Size >= 16 &&
1434        (Subtarget->isUnalignedMemAccessFast() ||
1435         ((DstAlign == 0 || DstAlign >= 16) &&
1436          (SrcAlign == 0 || SrcAlign >= 16)))) {
1437      if (Size >= 32) {
1438        if (Subtarget->hasInt256())
1439          return MVT::v8i32;
1440        if (Subtarget->hasFp256())
1441          return MVT::v8f32;
1442      }
1443      if (Subtarget->hasSSE2())
1444        return MVT::v4i32;
1445      if (Subtarget->hasSSE1())
1446        return MVT::v4f32;
1447    } else if (!MemcpyStrSrc && Size >= 8 &&
1448               !Subtarget->is64Bit() &&
1449               Subtarget->hasSSE2()) {
1450      // Do not use f64 to lower memcpy if source is string constant. It's
1451      // better to use i32 to avoid the loads.
1452      return MVT::f64;
1453    }
1454  }
1455  if (Subtarget->is64Bit() && Size >= 8)
1456    return MVT::i64;
1457  return MVT::i32;
1458}
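// A few hedged examples of how the logic above plays out (assuming no
// NoImplicitFloat attribute and fast unaligned accesses):
//   - 32-byte memset on an AVX2 subtarget                    -> MVT::v8i32
//   - 16-byte memcpy on a plain SSE2 subtarget               -> MVT::v4i32
//   - 8-byte memcpy of non-constant data, 32-bit with SSE2   -> MVT::f64
//   - otherwise, 8 bytes or more on x86-64 -> MVT::i64, else -> MVT::i32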
1459
1460bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1461  if (VT == MVT::f32)
1462    return X86ScalarSSEf32;
1463  else if (VT == MVT::f64)
1464    return X86ScalarSSEf64;
1465  return true;
1466}
1467
1468bool
1469X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
1470  if (Fast)
1471    *Fast = Subtarget->isUnalignedMemAccessFast();
1472  return true;
1473}
1474
1475/// getJumpTableEncoding - Return the entry encoding for a jump table in the
1476/// current function.  The returned value is a member of the
1477/// MachineJumpTableInfo::JTEntryKind enum.
1478unsigned X86TargetLowering::getJumpTableEncoding() const {
1479  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1480  // symbol.
1481  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1482      Subtarget->isPICStyleGOT())
1483    return MachineJumpTableInfo::EK_Custom32;
1484
1485  // Otherwise, use the normal jump table encoding heuristics.
1486  return TargetLowering::getJumpTableEncoding();
1487}
1488
1489const MCExpr *
1490X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1491                                             const MachineBasicBlock *MBB,
1492                                             unsigned uid,MCContext &Ctx) const{
1493  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1494         Subtarget->isPICStyleGOT());
1495  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1496  // entries.
1497  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1498                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1499}
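// For illustration: with this encoding each jump table entry is emitted as a
// @GOTOFF reference to the basic block's label, i.e. something along the lines
// of
//   .long  .LBB0_2@GOTOFF
// (the label name is hypothetical), making the entry position-independent
// relative to the GOT base held in a register.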
1500
1501 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
1502 /// jumptable.
1503SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1504                                                    SelectionDAG &DAG) const {
1505  if (!Subtarget->is64Bit())
1506    // This doesn't have SDLoc associated with it, but is not really the
1507    // same as a Register.
1508    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1509  return Table;
1510}
1511
1512/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1513/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1514/// MCExpr.
1515const MCExpr *X86TargetLowering::
1516getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1517                             MCContext &Ctx) const {
1518  // X86-64 uses RIP relative addressing based on the jump table label.
1519  if (Subtarget->isPICStyleRIPRel())
1520    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1521
1522  // Otherwise, the reference is relative to the PIC base.
1523  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1524}
1525
1526 // FIXME: Why is this routine here? Move it to RegInfo!
1527std::pair<const TargetRegisterClass*, uint8_t>
1528X86TargetLowering::findRepresentativeClass(MVT VT) const{
1529  const TargetRegisterClass *RRC = 0;
1530  uint8_t Cost = 1;
1531  switch (VT.SimpleTy) {
1532  default:
1533    return TargetLowering::findRepresentativeClass(VT);
1534  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1535    RRC = Subtarget->is64Bit() ?
1536      (const TargetRegisterClass*)&X86::GR64RegClass :
1537      (const TargetRegisterClass*)&X86::GR32RegClass;
1538    break;
1539  case MVT::x86mmx:
1540    RRC = &X86::VR64RegClass;
1541    break;
1542  case MVT::f32: case MVT::f64:
1543  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1544  case MVT::v4f32: case MVT::v2f64:
1545  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1546  case MVT::v4f64:
1547    RRC = &X86::VR128RegClass;
1548    break;
1549  }
1550  return std::make_pair(RRC, Cost);
1551}
1552
1553bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1554                                               unsigned &Offset) const {
1555  if (!Subtarget->isTargetLinux())
1556    return false;
1557
1558  if (Subtarget->is64Bit()) {
1559    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1560    Offset = 0x28;
1561    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1562      AddressSpace = 256;
1563    else
1564      AddressSpace = 257;
1565  } else {
1566    // %gs:0x14 on i386
1567    Offset = 0x14;
1568    AddressSpace = 256;
1569  }
1570  return true;
1571}
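// Sketch of the resulting stack-protector access (AT&T syntax, glibc layout
// assumed): the address-space numbers above map 256 -> %gs and 257 -> %fs, so
// a 64-bit user-mode function loads its cookie with roughly
//   movq %fs:0x28, %rax
// while i386 code uses
//   movl %gs:0x14, %eax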
1572
1573//===----------------------------------------------------------------------===//
1574//               Return Value Calling Convention Implementation
1575//===----------------------------------------------------------------------===//
1576
1577#include "X86GenCallingConv.inc"
1578
1579bool
1580X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1581                                  MachineFunction &MF, bool isVarArg,
1582                        const SmallVectorImpl<ISD::OutputArg> &Outs,
1583                        LLVMContext &Context) const {
1584  SmallVector<CCValAssign, 16> RVLocs;
1585  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1586                 RVLocs, Context);
1587  return CCInfo.CheckReturn(Outs, RetCC_X86);
1588}
1589
1590SDValue
1591X86TargetLowering::LowerReturn(SDValue Chain,
1592                               CallingConv::ID CallConv, bool isVarArg,
1593                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1594                               const SmallVectorImpl<SDValue> &OutVals,
1595                               SDLoc dl, SelectionDAG &DAG) const {
1596  MachineFunction &MF = DAG.getMachineFunction();
1597  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1598
1599  SmallVector<CCValAssign, 16> RVLocs;
1600  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1601                 RVLocs, *DAG.getContext());
1602  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1603
1604  SDValue Flag;
1605  SmallVector<SDValue, 6> RetOps;
1606  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1607  // Operand #1 = Bytes To Pop
1608  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1609                   MVT::i16));
1610
1611  // Copy the result values into the output registers.
1612  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1613    CCValAssign &VA = RVLocs[i];
1614    assert(VA.isRegLoc() && "Can only return in registers!");
1615    SDValue ValToCopy = OutVals[i];
1616    EVT ValVT = ValToCopy.getValueType();
1617
1618    // Promote values to the appropriate types
1619    if (VA.getLocInfo() == CCValAssign::SExt)
1620      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
1621    else if (VA.getLocInfo() == CCValAssign::ZExt)
1622      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
1623    else if (VA.getLocInfo() == CCValAssign::AExt)
1624      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
1625    else if (VA.getLocInfo() == CCValAssign::BCvt)
1626      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
1627
1628    // If this is x86-64, and we disabled SSE, we can't return FP values,
1629    // or SSE or MMX vectors.
1630    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
1631         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
1632          (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
1633      report_fatal_error("SSE register return with SSE disabled");
1634    }
1635    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
1636    // llvm-gcc has never done it right and no one has noticed, so this
1637    // should be OK for now.
1638    if (ValVT == MVT::f64 &&
1639        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
1640      report_fatal_error("SSE2 register return with SSE2 disabled");
1641
1642    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1643    // the RET instruction and handled by the FP Stackifier.
1644    if (VA.getLocReg() == X86::ST0 ||
1645        VA.getLocReg() == X86::ST1) {
1646      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1647      // change the value to the FP stack register class.
1648      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1649        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1650      RetOps.push_back(ValToCopy);
1651      // Don't emit a copytoreg.
1652      continue;
1653    }
1654
1655    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1656    // which is returned in RAX / RDX.
1657    if (Subtarget->is64Bit()) {
1658      if (ValVT == MVT::x86mmx) {
1659        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1660          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
1661          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
1662                                  ValToCopy);
1663          // If we don't have SSE2 available, convert to v4f32 so the generated
1664          // register is legal.
1665          if (!Subtarget->hasSSE2())
1666            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
1667        }
1668      }
1669    }
1670
1671    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1672    Flag = Chain.getValue(1);
1673    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1674  }
1675
1676  // The x86-64 ABIs require that for returning structs by value we copy
1677  // the sret argument into %rax/%eax (depending on ABI) for the return.
1678  // Win32 requires us to put the sret argument to %eax as well.
1679  // We saved the argument into a virtual register in the entry block,
1680  // so now we copy the value out and into %rax/%eax.
1681  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
1682      (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
1683    MachineFunction &MF = DAG.getMachineFunction();
1684    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1685    unsigned Reg = FuncInfo->getSRetReturnReg();
1686    assert(Reg &&
1687           "SRetReturnReg should have been set in LowerFormalArguments().");
1688    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1689
1690    unsigned RetValReg
1691        = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
1692          X86::RAX : X86::EAX;
1693    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
1694    Flag = Chain.getValue(1);
1695
1696    // RAX/EAX now acts like a return value.
1697    RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
1698  }
1699
1700  RetOps[0] = Chain;  // Update chain.
1701
1702  // Add the flag if we have it.
1703  if (Flag.getNode())
1704    RetOps.push_back(Flag);
1705
1706  return DAG.getNode(X86ISD::RET_FLAG, dl,
1707                     MVT::Other, &RetOps[0], RetOps.size());
1708}
1709
1710bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
1711  if (N->getNumValues() != 1)
1712    return false;
1713  if (!N->hasNUsesOfValue(1, 0))
1714    return false;
1715
1716  SDValue TCChain = Chain;
1717  SDNode *Copy = *N->use_begin();
1718  if (Copy->getOpcode() == ISD::CopyToReg) {
1719    // If the copy has a glue operand, we conservatively assume it isn't safe to
1720    // perform a tail call.
1721    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
1722      return false;
1723    TCChain = Copy->getOperand(0);
1724  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
1725    return false;
1726
1727  bool HasRet = false;
1728  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
1729       UI != UE; ++UI) {
1730    if (UI->getOpcode() != X86ISD::RET_FLAG)
1731      return false;
1732    HasRet = true;
1733  }
1734
1735  if (!HasRet)
1736    return false;
1737
1738  Chain = TCChain;
1739  return true;
1740}
1741
1742MVT
1743X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
1744                                            ISD::NodeType ExtendKind) const {
1745  MVT ReturnMVT;
1746  // TODO: Is this also valid on 32-bit?
1747  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
1748    ReturnMVT = MVT::i8;
1749  else
1750    ReturnMVT = MVT::i32;
1751
1752  MVT MinVT = getRegisterType(ReturnMVT);
1753  return VT.bitsLT(MinVT) ? MinVT : VT;
1754}
1755
1756/// LowerCallResult - Lower the result values of a call into the
1757/// appropriate copies out of appropriate physical registers.
1758///
1759SDValue
1760X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1761                                   CallingConv::ID CallConv, bool isVarArg,
1762                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1763                                   SDLoc dl, SelectionDAG &DAG,
1764                                   SmallVectorImpl<SDValue> &InVals) const {
1765
1766  // Assign locations to each value returned by this call.
1767  SmallVector<CCValAssign, 16> RVLocs;
1768  bool Is64Bit = Subtarget->is64Bit();
1769  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1770                 getTargetMachine(), RVLocs, *DAG.getContext());
1771  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1772
1773  // Copy all of the result registers out of their specified physreg.
1774  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
1775    CCValAssign &VA = RVLocs[i];
1776    EVT CopyVT = VA.getValVT();
1777
1778    // If this is x86-64, and we disabled SSE, we can't return FP values
1779    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1780        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1781      report_fatal_error("SSE register return with SSE disabled");
1782    }
1783
1784    SDValue Val;
1785
1786    // If this is a call to a function that returns an fp value on the floating
1787    // point stack, we must guarantee the value is popped from the stack, so
1788    // a CopyFromReg is not good enough - the copy instruction may be eliminated
1789    // if the return value is not used. We use the FpPOP_RETVAL instruction
1790    // instead.
1791    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1792      // If we prefer to use the value in xmm registers, copy it out as f80 and
1793      // use a truncate to move it from fp stack reg to xmm reg.
1794      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1795      SDValue Ops[] = { Chain, InFlag };
1796      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
1797                                         MVT::Other, MVT::Glue, Ops), 1);
1798      Val = Chain.getValue(0);
1799
1800      // Round the f80 to the right size, which also moves it to the appropriate
1801      // xmm register.
1802      if (CopyVT != VA.getValVT())
1803        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1804                          // This truncation won't change the value.
1805                          DAG.getIntPtrConstant(1));
1806    } else {
1807      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1808                                 CopyVT, InFlag).getValue(1);
1809      Val = Chain.getValue(0);
1810    }
1811    InFlag = Chain.getValue(2);
1812    InVals.push_back(Val);
1813  }
1814
1815  return Chain;
1816}
1817
1818//===----------------------------------------------------------------------===//
1819//                C & StdCall & Fast Calling Convention implementation
1820//===----------------------------------------------------------------------===//
1821 //  The StdCall calling convention is the standard for many Windows API
1822 //  routines. It differs from the C calling convention only slightly: the
1823 //  callee cleans up the stack rather than the caller, and symbols are also
1824 //  decorated in a particular way. It doesn't support any vector arguments.
1825//  For info on fast calling convention see Fast Calling Convention (tail call)
1826//  implementation LowerX86_32FastCCCallTo.
1827
1828/// CallIsStructReturn - Determines whether a call uses struct return
1829/// semantics.
1830enum StructReturnType {
1831  NotStructReturn,
1832  RegStructReturn,
1833  StackStructReturn
1834};
1835static StructReturnType
1836callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1837  if (Outs.empty())
1838    return NotStructReturn;
1839
1840  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
1841  if (!Flags.isSRet())
1842    return NotStructReturn;
1843  if (Flags.isInReg())
1844    return RegStructReturn;
1845  return StackStructReturn;
1846}
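// Hedged IR examples (hypothetical function names) of how the classification
// above applies to an outgoing call:
//   call void @f(%struct.S* sret %tmp)        ; sret on first arg   -> StackStructReturn
//   call void @g(%struct.S* inreg sret %tmp)  ; sret + inreg        -> RegStructReturn
//   call i32 @h(i32 %x)                       ; no sret flag at all -> NotStructReturn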
1847
1848/// ArgsAreStructReturn - Determines whether a function uses struct
1849/// return semantics.
1850static StructReturnType
1851argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1852  if (Ins.empty())
1853    return NotStructReturn;
1854
1855  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
1856  if (!Flags.isSRet())
1857    return NotStructReturn;
1858  if (Flags.isInReg())
1859    return RegStructReturn;
1860  return StackStructReturn;
1861}
1862
1863 /// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
1864 /// specified by "Src" to the address "Dst", with the size and alignment
1865 /// information specified by the corresponding parameter attribute. The copy
1866 /// will be passed as a byval function parameter.
1867static SDValue
1868CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1869                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1870                          SDLoc dl) {
1871  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1872
1873  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1874                       /*isVolatile*/false, /*AlwaysInline=*/true,
1875                       MachinePointerInfo(), MachinePointerInfo());
1876}
1877
1878/// IsTailCallConvention - Return true if the calling convention is one that
1879/// supports tail call optimization.
1880static bool IsTailCallConvention(CallingConv::ID CC) {
1881  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
1882          CC == CallingConv::HiPE);
1883}
1884
1885/// \brief Return true if the calling convention is a C calling convention.
1886static bool IsCCallConvention(CallingConv::ID CC) {
1887  return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
1888          CC == CallingConv::X86_64_SysV);
1889}
1890
1891bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
1892  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
1893    return false;
1894
1895  CallSite CS(CI);
1896  CallingConv::ID CalleeCC = CS.getCallingConv();
1897  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
1898    return false;
1899
1900  return true;
1901}
1902
1903/// FuncIsMadeTailCallSafe - Return true if the function is being made into
1904/// a tailcall target by changing its ABI.
1905static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
1906                                   bool GuaranteedTailCallOpt) {
1907  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1908}
1909
1910SDValue
1911X86TargetLowering::LowerMemArgument(SDValue Chain,
1912                                    CallingConv::ID CallConv,
1913                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1914                                    SDLoc dl, SelectionDAG &DAG,
1915                                    const CCValAssign &VA,
1916                                    MachineFrameInfo *MFI,
1917                                    unsigned i) const {
1918  // Create the nodes corresponding to a load from this parameter slot.
1919  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1920  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
1921                              getTargetMachine().Options.GuaranteedTailCallOpt);
1922  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1923  EVT ValVT;
1924
1925  // If value is passed by pointer we have address passed instead of the value
1926  // itself.
1927  if (VA.getLocInfo() == CCValAssign::Indirect)
1928    ValVT = VA.getLocVT();
1929  else
1930    ValVT = VA.getValVT();
1931
1932  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1933  // changed with more analysis.
1934  // In case of tail call optimization, mark all arguments mutable, since they
1935  // could be overwritten by the lowering of the arguments in case of a tail call.
1936  if (Flags.isByVal()) {
1937    unsigned Bytes = Flags.getByValSize();
1938    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1939    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
1940    return DAG.getFrameIndex(FI, getPointerTy());
1941  } else {
1942    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1943                                    VA.getLocMemOffset(), isImmutable);
1944    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1945    return DAG.getLoad(ValVT, dl, Chain, FIN,
1946                       MachinePointerInfo::getFixedStack(FI),
1947                       false, false, false, 0);
1948  }
1949}
1950
1951SDValue
1952X86TargetLowering::LowerFormalArguments(SDValue Chain,
1953                                        CallingConv::ID CallConv,
1954                                        bool isVarArg,
1955                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1956                                        SDLoc dl,
1957                                        SelectionDAG &DAG,
1958                                        SmallVectorImpl<SDValue> &InVals)
1959                                          const {
1960  MachineFunction &MF = DAG.getMachineFunction();
1961  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1962
1963  const Function* Fn = MF.getFunction();
1964  if (Fn->hasExternalLinkage() &&
1965      Subtarget->isTargetCygMing() &&
1966      Fn->getName() == "main")
1967    FuncInfo->setForceFramePointer(true);
1968
1969  MachineFrameInfo *MFI = MF.getFrameInfo();
1970  bool Is64Bit = Subtarget->is64Bit();
1971  bool IsWindows = Subtarget->isTargetWindows();
1972  bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
1973
1974  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1975         "Var args not supported with calling convention fastcc, ghc or hipe");
1976
1977  // Assign locations to all of the incoming arguments.
1978  SmallVector<CCValAssign, 16> ArgLocs;
1979  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1980                 ArgLocs, *DAG.getContext());
1981
1982  // Allocate shadow area for Win64
1983  if (IsWin64)
1984    CCInfo.AllocateStack(32, 8);
1985
1986  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
1987
1988  unsigned LastVal = ~0U;
1989  SDValue ArgValue;
1990  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1991    CCValAssign &VA = ArgLocs[i];
1992    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1993    // places.
1994    assert(VA.getValNo() != LastVal &&
1995           "Don't support value assigned to multiple locs yet");
1996    (void)LastVal;
1997    LastVal = VA.getValNo();
1998
1999    if (VA.isRegLoc()) {
2000      EVT RegVT = VA.getLocVT();
2001      const TargetRegisterClass *RC;
2002      if (RegVT == MVT::i32)
2003        RC = &X86::GR32RegClass;
2004      else if (Is64Bit && RegVT == MVT::i64)
2005        RC = &X86::GR64RegClass;
2006      else if (RegVT == MVT::f32)
2007        RC = &X86::FR32RegClass;
2008      else if (RegVT == MVT::f64)
2009        RC = &X86::FR64RegClass;
2010      else if (RegVT.is256BitVector())
2011        RC = &X86::VR256RegClass;
2012      else if (RegVT.is128BitVector())
2013        RC = &X86::VR128RegClass;
2014      else if (RegVT == MVT::x86mmx)
2015        RC = &X86::VR64RegClass;
2016      else
2017        llvm_unreachable("Unknown argument type!");
2018
2019      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2020      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2021
2022      // If this is an 8 or 16-bit value, it is really passed promoted to 32
2023      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2024      // right size.
2025      if (VA.getLocInfo() == CCValAssign::SExt)
2026        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2027                               DAG.getValueType(VA.getValVT()));
2028      else if (VA.getLocInfo() == CCValAssign::ZExt)
2029        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2030                               DAG.getValueType(VA.getValVT()));
2031      else if (VA.getLocInfo() == CCValAssign::BCvt)
2032        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2033
2034      if (VA.isExtInLoc()) {
2035        // Handle MMX values passed in XMM regs.
2036        if (RegVT.isVector())
2037          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2038        else
2039          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2040      }
2041    } else {
2042      assert(VA.isMemLoc());
2043      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2044    }
2045
2046    // If value is passed via pointer - do a load.
2047    if (VA.getLocInfo() == CCValAssign::Indirect)
2048      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2049                             MachinePointerInfo(), false, false, false, 0);
2050
2051    InVals.push_back(ArgValue);
2052  }
2053
2054  // The x86-64 ABIs require that for returning structs by value we copy
2055  // the sret argument into %rax/%eax (depending on ABI) for the return.
2056  // Win32 requires us to put the sret argument to %eax as well.
2057  // Save the argument into a virtual register so that we can access it
2058  // from the return points.
2059  if (MF.getFunction()->hasStructRetAttr() &&
2060      (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
2061    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2062    unsigned Reg = FuncInfo->getSRetReturnReg();
2063    if (!Reg) {
2064      MVT PtrTy = getPointerTy();
2065      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2066      FuncInfo->setSRetReturnReg(Reg);
2067    }
2068    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
2069    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2070  }
2071
2072  unsigned StackSize = CCInfo.getNextStackOffset();
2073  // Align stack specially for tail calls.
2074  if (FuncIsMadeTailCallSafe(CallConv,
2075                             MF.getTarget().Options.GuaranteedTailCallOpt))
2076    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2077
2078  // If the function takes variable number of arguments, make a frame index for
2079  // the start of the first vararg value... for expansion of llvm.va_start.
2080  if (isVarArg) {
2081    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2082                    CallConv != CallingConv::X86_ThisCall)) {
2083      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
2084    }
2085    if (Is64Bit) {
2086      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
2087
2088      // FIXME: We should really autogenerate these arrays
2089      static const uint16_t GPR64ArgRegsWin64[] = {
2090        X86::RCX, X86::RDX, X86::R8,  X86::R9
2091      };
2092      static const uint16_t GPR64ArgRegs64Bit[] = {
2093        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2094      };
2095      static const uint16_t XMMArgRegs64Bit[] = {
2096        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2097        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2098      };
2099      const uint16_t *GPR64ArgRegs;
2100      unsigned NumXMMRegs = 0;
2101
2102      if (IsWin64) {
2103        // The XMM registers which might contain var arg parameters are shadowed
2104        // in their paired GPR.  So we only need to save the GPRs to their home
2105        // slots.
2106        TotalNumIntRegs = 4;
2107        GPR64ArgRegs = GPR64ArgRegsWin64;
2108      } else {
2109        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
2110        GPR64ArgRegs = GPR64ArgRegs64Bit;
2111
2112        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
2113                                                TotalNumXMMRegs);
2114      }
2115      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
2116                                                       TotalNumIntRegs);
2117
2118      bool NoImplicitFloatOps = Fn->getAttributes().
2119        hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2120      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2121             "SSE register cannot be used when SSE is disabled!");
2122      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
2123               NoImplicitFloatOps) &&
2124             "SSE register cannot be used when SSE is disabled!");
2125      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2126          !Subtarget->hasSSE1())
2127        // Kernel mode asks for SSE to be disabled, so don't push them
2128        // on the stack.
2129        TotalNumXMMRegs = 0;
2130
2131      if (IsWin64) {
2132        const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
2133        // Get to the caller-allocated home save location.  Add 8 to account
2134        // for the return address.
2135        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2136        FuncInfo->setRegSaveFrameIndex(
2137          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2138        // Fixup to set vararg frame on shadow area (4 x i64).
2139        if (NumIntRegs < 4)
2140          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2141      } else {
2142        // For X86-64, if there are vararg parameters that are passed via
2143        // registers, then we must store them to their spots on the stack so
2144        // they may be loaded by dereferencing the result of va_next.
2145        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2146        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
2147        FuncInfo->setRegSaveFrameIndex(
2148          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
2149                               false));
2150      }
2151
2152      // Store the integer parameter registers.
2153      SmallVector<SDValue, 8> MemOps;
2154      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2155                                        getPointerTy());
2156      unsigned Offset = FuncInfo->getVarArgsGPOffset();
2157      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
2158        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2159                                  DAG.getIntPtrConstant(Offset));
2160        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
2161                                     &X86::GR64RegClass);
2162        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
2163        SDValue Store =
2164          DAG.getStore(Val.getValue(1), dl, Val, FIN,
2165                       MachinePointerInfo::getFixedStack(
2166                         FuncInfo->getRegSaveFrameIndex(), Offset),
2167                       false, false, 0);
2168        MemOps.push_back(Store);
2169        Offset += 8;
2170      }
2171
2172      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
2173        // Now store the XMM (fp + vector) parameter registers.
2174        SmallVector<SDValue, 11> SaveXMMOps;
2175        SaveXMMOps.push_back(Chain);
2176
2177        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2178        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
2179        SaveXMMOps.push_back(ALVal);
2180
2181        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2182                               FuncInfo->getRegSaveFrameIndex()));
2183        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2184                               FuncInfo->getVarArgsFPOffset()));
2185
2186        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
2187          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
2188                                       &X86::VR128RegClass);
2189          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
2190          SaveXMMOps.push_back(Val);
2191        }
2192        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2193                                     MVT::Other,
2194                                     &SaveXMMOps[0], SaveXMMOps.size()));
2195      }
2196
2197      if (!MemOps.empty())
2198        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2199                            &MemOps[0], MemOps.size());
2200    }
2201  }
2202
2203  // Some CCs need callee pop.
2204  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2205                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
2206    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2207  } else {
2208    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2209    // If this is an sret function, the return should pop the hidden pointer.
2210    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2211        argsAreStructReturn(Ins) == StackStructReturn)
2212      FuncInfo->setBytesToPopOnReturn(4);
2213  }
2214
2215  if (!Is64Bit) {
2216    // RegSaveFrameIndex is X86-64 only.
2217    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2218    if (CallConv == CallingConv::X86_FastCall ||
2219        CallConv == CallingConv::X86_ThisCall)
2220      // fastcc functions can't have varargs.
2221      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2222  }
2223
2224  FuncInfo->setArgumentStackSize(StackSize);
2225
2226  return Chain;
2227}
2228
2229SDValue
2230X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2231                                    SDValue StackPtr, SDValue Arg,
2232                                    SDLoc dl, SelectionDAG &DAG,
2233                                    const CCValAssign &VA,
2234                                    ISD::ArgFlagsTy Flags) const {
2235  unsigned LocMemOffset = VA.getLocMemOffset();
2236  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2237  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2238  if (Flags.isByVal())
2239    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2240
2241  return DAG.getStore(Chain, dl, Arg, PtrOff,
2242                      MachinePointerInfo::getStack(LocMemOffset),
2243                      false, false, 0);
2244}
2245
2246/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
2247/// optimization is performed and it is required.
2248SDValue
2249X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2250                                           SDValue &OutRetAddr, SDValue Chain,
2251                                           bool IsTailCall, bool Is64Bit,
2252                                           int FPDiff, SDLoc dl) const {
2253  // Adjust the Return address stack slot.
2254  EVT VT = getPointerTy();
2255  OutRetAddr = getReturnAddressFrameIndex(DAG);
2256
2257  // Load the "old" Return address.
2258  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2259                           false, false, false, 0);
2260  return SDValue(OutRetAddr.getNode(), 1);
2261}
2262
2263/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
2264/// optimization is performed and it is required (FPDiff!=0).
2265static SDValue
2266EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
2267                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
2268                         unsigned SlotSize, int FPDiff, SDLoc dl) {
2269  // Store the return address to the appropriate stack slot.
2270  if (!FPDiff) return Chain;
2271  // Calculate the new stack slot for the return address.
2272  int NewReturnAddrFI =
2273    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
2274  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2275  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2276                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2277                       false, false, 0);
2278  return Chain;
2279}
2280
2281SDValue
2282X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2283                             SmallVectorImpl<SDValue> &InVals) const {
2284  SelectionDAG &DAG                     = CLI.DAG;
2285  SDLoc &dl                             = CLI.DL;
2286  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2287  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2288  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2289  SDValue Chain                         = CLI.Chain;
2290  SDValue Callee                        = CLI.Callee;
2291  CallingConv::ID CallConv              = CLI.CallConv;
2292  bool &isTailCall                      = CLI.IsTailCall;
2293  bool isVarArg                         = CLI.IsVarArg;
2294
2295  MachineFunction &MF = DAG.getMachineFunction();
2296  bool Is64Bit        = Subtarget->is64Bit();
2297  bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2298  bool IsWindows      = Subtarget->isTargetWindows();
2299  StructReturnType SR = callIsStructReturn(Outs);
2300  bool IsSibcall      = false;
2301
2302  if (MF.getTarget().Options.DisableTailCalls)
2303    isTailCall = false;
2304
2305  if (isTailCall) {
2306    // Check if it's really possible to do a tail call.
2307    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2308                    isVarArg, SR != NotStructReturn,
2309                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2310                    Outs, OutVals, Ins, DAG);
2311
2312    // Sibcalls are automatically detected tailcalls which do not require
2313    // ABI changes.
2314    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2315      IsSibcall = true;
2316
2317    if (isTailCall)
2318      ++NumTailCalls;
2319  }
2320
2321  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2322         "Var args not supported with calling convention fastcc, ghc or hipe");
2323
2324  // Analyze operands of the call, assigning locations to each operand.
2325  SmallVector<CCValAssign, 16> ArgLocs;
2326  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2327                 ArgLocs, *DAG.getContext());
2328
2329  // Allocate shadow area for Win64
2330  if (IsWin64)
2331    CCInfo.AllocateStack(32, 8);
2332
2333  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2334
2335  // Get a count of how many bytes are to be pushed on the stack.
2336  unsigned NumBytes = CCInfo.getNextStackOffset();
2337  if (IsSibcall)
2338    // This is a sibcall. The memory operands are available in the caller's
2339    // own stack.
2340    NumBytes = 0;
2341  else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
2342           IsTailCallConvention(CallConv))
2343    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2344
2345  int FPDiff = 0;
2346  if (isTailCall && !IsSibcall) {
2347    // Lower arguments at fp - stackoffset + fpdiff.
2348    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2349    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2350
2351    FPDiff = NumBytesCallerPushed - NumBytes;
2352
2353    // Set the delta of the movement of the return address stack slot.
2354    // But only set it if the delta is greater than the previous delta.
2355    if (FPDiff < X86Info->getTCReturnAddrDelta())
2356      X86Info->setTCReturnAddrDelta(FPDiff);
2357  }
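  // Worked arithmetic for the block above (hypothetical sizes): if the caller's
  // incoming argument area holds 16 bytes (NumBytesCallerPushed == 16) and this
  // call needs 32 bytes of outgoing arguments (NumBytes == 32), then
  // FPDiff == 16 - 32 == -16, and EmitTailCallStoreRetAddr later relocates the
  // return address slot (offset FPDiff - SlotSize) to make room for the larger
  // outgoing argument area.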
2358
2359  if (!IsSibcall)
2360    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
2361                                 dl);
2362
2363  SDValue RetAddrFrIdx;
2364  // Load return address for tail calls.
2365  if (isTailCall && FPDiff)
2366    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2367                                    Is64Bit, FPDiff, dl);
2368
2369  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2370  SmallVector<SDValue, 8> MemOpChains;
2371  SDValue StackPtr;
2372
2373  // Walk the register/memloc assignments, inserting copies/loads.  In the case
2374  // of tail call optimization, arguments are handled later.
2375  const X86RegisterInfo *RegInfo =
2376    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
2377  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2378    CCValAssign &VA = ArgLocs[i];
2379    EVT RegVT = VA.getLocVT();
2380    SDValue Arg = OutVals[i];
2381    ISD::ArgFlagsTy Flags = Outs[i].Flags;
2382    bool isByVal = Flags.isByVal();
2383
2384    // Promote the value if needed.
2385    switch (VA.getLocInfo()) {
2386    default: llvm_unreachable("Unknown loc info!");
2387    case CCValAssign::Full: break;
2388    case CCValAssign::SExt:
2389      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2390      break;
2391    case CCValAssign::ZExt:
2392      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2393      break;
2394    case CCValAssign::AExt:
2395      if (RegVT.is128BitVector()) {
2396        // Special case: passing MMX values in XMM registers.
2397        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2398        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2399        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2400      } else
2401        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2402      break;
2403    case CCValAssign::BCvt:
2404      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2405      break;
2406    case CCValAssign::Indirect: {
2407      // Store the argument.
2408      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2409      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2410      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2411                           MachinePointerInfo::getFixedStack(FI),
2412                           false, false, 0);
2413      Arg = SpillSlot;
2414      break;
2415    }
2416    }
2417
2418    if (VA.isRegLoc()) {
2419      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2420      if (isVarArg && IsWin64) {
2421        // The Win64 ABI requires an argument passed in an XMM register to be
2422        // copied to its corresponding shadow GPR if the callee is a varargs function.
2423        unsigned ShadowReg = 0;
2424        switch (VA.getLocReg()) {
2425        case X86::XMM0: ShadowReg = X86::RCX; break;
2426        case X86::XMM1: ShadowReg = X86::RDX; break;
2427        case X86::XMM2: ShadowReg = X86::R8; break;
2428        case X86::XMM3: ShadowReg = X86::R9; break;
2429        }
2430        if (ShadowReg)
2431          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2432      }
2433    } else if (!IsSibcall && (!isTailCall || isByVal)) {
2434      assert(VA.isMemLoc());
2435      if (StackPtr.getNode() == 0)
2436        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2437                                      getPointerTy());
2438      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2439                                             dl, DAG, VA, Flags));
2440    }
2441  }
2442
2443  if (!MemOpChains.empty())
2444    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2445                        &MemOpChains[0], MemOpChains.size());
2446
2447  if (Subtarget->isPICStyleGOT()) {
2448    // ELF / PIC requires the GOT pointer to be in the EBX register before any
2449    // function call made via the PLT.
2450    if (!isTailCall) {
2451      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2452               DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2453    } else {
2454      // If we are tail calling and generating PIC/GOT style code load the
2455      // address of the callee into ECX. The value in ecx is used as target of
2456      // the tail jump. This is done to circumvent the ebx/callee-saved problem
2457      // for tail calls on PIC/GOT architectures. Normally we would just put the
2458      // address of GOT into ebx and then call target@PLT. But for tail calls
2459      // ebx would be restored (since ebx is callee saved) before jumping to the
2460      // target@PLT.
2461
2462      // Note: The actual moving to ECX is done further down.
2463      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2464      if (G && !G->getGlobal()->hasHiddenVisibility() &&
2465          !G->getGlobal()->hasProtectedVisibility())
2466        Callee = LowerGlobalAddress(Callee, DAG);
2467      else if (isa<ExternalSymbolSDNode>(Callee))
2468        Callee = LowerExternalSymbol(Callee, DAG);
2469    }
2470  }
2471
2472  if (Is64Bit && isVarArg && !IsWin64) {
2473    // From AMD64 ABI document:
2474    // For calls that may call functions that use varargs or stdargs
2475    // (prototype-less calls or calls to functions containing ellipsis (...) in
2476    // the declaration) %al is used as hidden argument to specify the number
2477    // of SSE registers used. The contents of %al do not need to match exactly
2478    // the number of registers, but must be an upper bound on the number of SSE
2479    // registers used and is in the range 0 - 8 inclusive.
2480
2481    // Count the number of XMM registers allocated.
2482    static const uint16_t XMMArgRegs[] = {
2483      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2484      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2485    };
2486    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2487    assert((Subtarget->hasSSE1() || !NumXMMRegs)
2488           && "SSE registers cannot be used when SSE is disabled");
2489
2490    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
2491                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
2492  }
2493
2494  // For tail calls, lower the arguments to the 'real' stack slots.
2495  if (isTailCall) {
2496    // Force all the incoming stack arguments to be loaded from the stack
2497    // before any new outgoing arguments are stored to the stack, because the
2498    // outgoing stack slots may alias the incoming argument stack slots, and
2499    // the alias isn't otherwise explicit. This is slightly more conservative
2500    // than necessary, because it means that each store effectively depends
2501    // on every argument instead of just those arguments it would clobber.
2502    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2503
2504    SmallVector<SDValue, 8> MemOpChains2;
2505    SDValue FIN;
2506    int FI = 0;
2507    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2508      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2509        CCValAssign &VA = ArgLocs[i];
2510        if (VA.isRegLoc())
2511          continue;
2512        assert(VA.isMemLoc());
2513        SDValue Arg = OutVals[i];
2514        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2515        // Create frame index.
2516        int32_t Offset = VA.getLocMemOffset()+FPDiff;
2517        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2518        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2519        FIN = DAG.getFrameIndex(FI, getPointerTy());
2520
2521        if (Flags.isByVal()) {
2522          // Copy relative to framepointer.
2523          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2524          if (StackPtr.getNode() == 0)
2525            StackPtr = DAG.getCopyFromReg(Chain, dl,
2526                                          RegInfo->getStackRegister(),
2527                                          getPointerTy());
2528          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2529
2530          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2531                                                           ArgChain,
2532                                                           Flags, DAG, dl));
2533        } else {
2534          // Store relative to framepointer.
2535          MemOpChains2.push_back(
2536            DAG.getStore(ArgChain, dl, Arg, FIN,
2537                         MachinePointerInfo::getFixedStack(FI),
2538                         false, false, 0));
2539        }
2540      }
2541    }
2542
2543    if (!MemOpChains2.empty())
2544      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2545                          &MemOpChains2[0], MemOpChains2.size());
2546
2547    // Store the return address to the appropriate stack slot.
2548    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2549                                     getPointerTy(), RegInfo->getSlotSize(),
2550                                     FPDiff, dl);
2551  }
2552
2553  // Build a sequence of copy-to-reg nodes chained together with token chain
2554  // and flag operands which copy the outgoing args into registers.
2555  SDValue InFlag;
2556  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2557    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2558                             RegsToPass[i].second, InFlag);
2559    InFlag = Chain.getValue(1);
2560  }
2561
2562  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2563    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2564    // In the 64-bit large code model, we have to make all calls
2565    // through a register, since the call instruction's 32-bit
2566    // pc-relative offset may not be large enough to hold the whole
2567    // address.
2568  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2569    // If the callee is a GlobalAddress node (quite common, every direct call
2570    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2571    // it.
2572
2573    // We should use extra load for direct calls to dllimported functions in
2574    // non-JIT mode.
2575    const GlobalValue *GV = G->getGlobal();
2576    if (!GV->hasDLLImportLinkage()) {
2577      unsigned char OpFlags = 0;
2578      bool ExtraLoad = false;
2579      unsigned WrapperKind = ISD::DELETED_NODE;
2580
2581      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2582      // external symbols must go through the PLT in PIC mode.  If the symbol
2583      // has hidden or protected visibility, or if it is static or local, then
2584      // we don't need to use the PLT - we can directly call it.
2585      if (Subtarget->isTargetELF() &&
2586          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2587          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2588        OpFlags = X86II::MO_PLT;
2589      } else if (Subtarget->isPICStyleStubAny() &&
2590                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
2591                 (!Subtarget->getTargetTriple().isMacOSX() ||
2592                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2593        // PC-relative references to external symbols should go through $stub,
2594        // unless we're building with the leopard linker or later, which
2595        // automatically synthesizes these stubs.
2596        OpFlags = X86II::MO_DARWIN_STUB;
2597      } else if (Subtarget->isPICStyleRIPRel() &&
2598                 isa<Function>(GV) &&
2599                 cast<Function>(GV)->getAttributes().
2600                   hasAttribute(AttributeSet::FunctionIndex,
2601                                Attribute::NonLazyBind)) {
2602        // If the function is marked as non-lazy, generate an indirect call
2603        // which loads from the GOT directly. This avoids runtime overhead
2604        // at the cost of eager binding (and one extra byte of encoding).
2605        OpFlags = X86II::MO_GOTPCREL;
2606        WrapperKind = X86ISD::WrapperRIP;
2607        ExtraLoad = true;
2608      }
2609
2610      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2611                                          G->getOffset(), OpFlags);
2612
2613      // Add a wrapper if needed.
2614      if (WrapperKind != ISD::DELETED_NODE)
2615        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
2616      // Add extra indirection if needed.
2617      if (ExtraLoad)
2618        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
2619                             MachinePointerInfo::getGOT(),
2620                             false, false, false, 0);
2621    }
2622  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2623    unsigned char OpFlags = 0;
2624
2625    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2626    // external symbols should go through the PLT.
2627    if (Subtarget->isTargetELF() &&
2628        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2629      OpFlags = X86II::MO_PLT;
2630    } else if (Subtarget->isPICStyleStubAny() &&
2631               (!Subtarget->getTargetTriple().isMacOSX() ||
2632                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2633      // PC-relative references to external symbols should go through $stub,
2634      // unless we're building with the leopard linker or later, which
2635      // automatically synthesizes these stubs.
2636      OpFlags = X86II::MO_DARWIN_STUB;
2637    }
2638
2639    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2640                                         OpFlags);
2641  }
2642
2643  // Returns a chain & a flag for retval copy to use.
2644  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2645  SmallVector<SDValue, 8> Ops;
2646
2647  if (!IsSibcall && isTailCall) {
2648    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2649                           DAG.getIntPtrConstant(0, true), InFlag, dl);
2650    InFlag = Chain.getValue(1);
2651  }
2652
2653  Ops.push_back(Chain);
2654  Ops.push_back(Callee);
2655
2656  if (isTailCall)
2657    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2658
2659  // Add argument registers to the end of the list so that they are known live
2660  // into the call.
2661  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2662    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2663                                  RegsToPass[i].second.getValueType()));
2664
2665  // Add a register mask operand representing the call-preserved registers.
2666  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
2667  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
2668  assert(Mask && "Missing call preserved mask for calling convention");
2669  Ops.push_back(DAG.getRegisterMask(Mask));
2670
2671  if (InFlag.getNode())
2672    Ops.push_back(InFlag);
2673
2674  if (isTailCall) {
2675    // We used to do:
2676    //// If this is the first return lowered for this function, add the regs
2677    //// to the liveout set for the function.
2678    // This isn't right, although it's probably harmless on x86; liveouts
2679    // should be computed from returns not tail calls.  Consider a void
2680    // function making a tail call to a function returning int.
2681    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
2682  }
2683
2684  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2685  InFlag = Chain.getValue(1);
2686
2687  // Create the CALLSEQ_END node.
2688  unsigned NumBytesForCalleeToPush;
2689  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2690                       getTargetMachine().Options.GuaranteedTailCallOpt))
2691    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2692  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2693           SR == StackStructReturn)
2694    // If this is a call to a struct-return function, the callee
2695    // pops the hidden struct pointer, so we have to push it back.
2696    // This is common for Darwin/X86, Linux & Mingw32 targets.
2697    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
2698    NumBytesForCalleeToPush = 4;
2699  else
2700    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2701
2702  // Returns a flag for retval copy to use.
2703  if (!IsSibcall) {
2704    Chain = DAG.getCALLSEQ_END(Chain,
2705                               DAG.getIntPtrConstant(NumBytes, true),
2706                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2707                                                     true),
2708                               InFlag, dl);
2709    InFlag = Chain.getValue(1);
2710  }
2711
2712  // Handle result values, copying them out of physregs into vregs that we
2713  // return.
2714  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2715                         Ins, dl, DAG, InVals);
2716}
2717
2718//===----------------------------------------------------------------------===//
2719//                Fast Calling Convention (tail call) implementation
2720//===----------------------------------------------------------------------===//
2721
2722//  Like stdcall, the callee cleans up the arguments, except that ECX is
2723//  reserved for storing the address of the tail-called function. Only 2
2724//  registers are free for argument passing (inreg). Tail call optimization is
2725//  performed provided:
2726//                * tailcallopt is enabled
2727//                * caller/callee are fastcc
2728//  On the X86_64 architecture with GOT-style position-independent code, only
2729//  local (within-module) calls are supported at the moment.
2730//  To keep the stack aligned according to the platform ABI, the function
2731//  GetAlignedArgumentStackSize ensures that the argument delta is always a
2732//  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
2733//  for example.) If the tail-called callee has more arguments than the caller,
2734//  the caller needs to make sure that there is room to move the RETADDR to.
2735//  This is achieved by reserving an area the size of the argument delta right
2736//  after the original RETADDR, but before the saved frame pointer or the spilled
2737//  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2738//  stack layout:
2739//    arg1
2740//    arg2
2741//    RETADDR
2742//    [ new RETADDR
2743//      move area ]
2744//    (possible EBP)
2745//    ESI
2746//    EDI
2747//    local1 ..
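//  As a worked example (assuming 4-byte argument slots), the two extra callee
//  arguments above give an argument delta of 8 bytes, so an 8-byte move area is
//  reserved for the relocated RETADDR.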
2748
2749/// GetAlignedArgumentStackSize - Round the stack size up so that it is of the
2750/// form 16n + 12 for a 16-byte alignment requirement and a 4-byte slot size.
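/// For example, with StackAlignment = 16 and SlotSize = 4, a StackSize of 20
/// becomes 28 and a StackSize of 30 becomes 44; both are of the form 16n + 12.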
2751unsigned
2752X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2753                                               SelectionDAG& DAG) const {
2754  MachineFunction &MF = DAG.getMachineFunction();
2755  const TargetMachine &TM = MF.getTarget();
2756  const X86RegisterInfo *RegInfo =
2757    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
2758  const TargetFrameLowering &TFI = *TM.getFrameLowering();
2759  unsigned StackAlignment = TFI.getStackAlignment();
2760  uint64_t AlignMask = StackAlignment - 1;
2761  int64_t Offset = StackSize;
2762  unsigned SlotSize = RegInfo->getSlotSize();
2763  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2764    // The low bits are at most StackAlignment - SlotSize (e.g. 12), so just add the difference.
2765    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2766  } else {
2767    // Mask out the lower bits, then add the stack alignment once plus StackAlignment - SlotSize (e.g. 12) bytes.
2768    Offset = ((~AlignMask) & Offset) + StackAlignment +
2769      (StackAlignment-SlotSize);
2770  }
2771  return Offset;
2772}
2773
2774/// MatchingStackOffset - Return true if the given stack call argument is
2775/// already available in the same position (relatively) of the caller's
2776/// incoming argument stack.
2777static
2778bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2779                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2780                         const X86InstrInfo *TII) {
2781  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2782  int FI = INT_MAX;
2783  if (Arg.getOpcode() == ISD::CopyFromReg) {
2784    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2785    if (!TargetRegisterInfo::isVirtualRegister(VR))
2786      return false;
2787    MachineInstr *Def = MRI->getVRegDef(VR);
2788    if (!Def)
2789      return false;
2790    if (!Flags.isByVal()) {
2791      if (!TII->isLoadFromStackSlot(Def, FI))
2792        return false;
2793    } else {
2794      unsigned Opcode = Def->getOpcode();
2795      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2796          Def->getOperand(1).isFI()) {
2797        FI = Def->getOperand(1).getIndex();
2798        Bytes = Flags.getByValSize();
2799      } else
2800        return false;
2801    }
2802  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2803    if (Flags.isByVal())
2804      // ByVal argument is passed in as a pointer but it's now being
2805      // dereferenced. e.g.
2806      // define @foo(%struct.X* %A) {
2807      //   tail call @bar(%struct.X* byval %A)
2808      // }
2809      return false;
2810    SDValue Ptr = Ld->getBasePtr();
2811    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2812    if (!FINode)
2813      return false;
2814    FI = FINode->getIndex();
2815  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
2816    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
2817    FI = FINode->getIndex();
2818    Bytes = Flags.getByValSize();
2819  } else
2820    return false;
2821
2822  assert(FI != INT_MAX);
2823  if (!MFI->isFixedObjectIndex(FI))
2824    return false;
2825  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2826}
2827
2828/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2829/// for tail call optimization. Targets which want to do tail call
2830/// optimization should implement this function.
2831bool
2832X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2833                                                     CallingConv::ID CalleeCC,
2834                                                     bool isVarArg,
2835                                                     bool isCalleeStructRet,
2836                                                     bool isCallerStructRet,
2837                                                     Type *RetTy,
2838                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
2839                                    const SmallVectorImpl<SDValue> &OutVals,
2840                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2841                                                     SelectionDAG &DAG) const {
2842  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2843    return false;
2844
2845  // If -tailcallopt is specified, make fastcc functions tail-callable.
2846  const MachineFunction &MF = DAG.getMachineFunction();
2847  const Function *CallerF = MF.getFunction();
2848
2849  // If the caller's return type is x86_fp80 and the callee's return type is not,
2850  // then the FP_EXTEND of the call result is not a nop. It's not safe to
2851  // perform a tailcall optimization here.
2852  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
2853    return false;
2854
2855  CallingConv::ID CallerCC = CallerF->getCallingConv();
2856  bool CCMatch = CallerCC == CalleeCC;
2857  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
2858  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
2859
2860  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2861    if (IsTailCallConvention(CalleeCC) && CCMatch)
2862      return true;
2863    return false;
2864  }
2865
2866  // Look for obvious safe cases to perform tail call optimization that do not
2867  // require ABI changes. This is what gcc calls sibcall.
2868
2869  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2870  // emit a special epilogue.
2871  const X86RegisterInfo *RegInfo =
2872    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
2873  if (RegInfo->needsStackRealignment(MF))
2874    return false;
2875
2876  // Also avoid sibcall optimization if either caller or callee uses struct
2877  // return semantics.
2878  if (isCalleeStructRet || isCallerStructRet)
2879    return false;
2880
2881  // An stdcall caller is expected to clean up its arguments; the callee
2882  // isn't going to do that.
2883  if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
2884    return false;
2885
2886  // Do not sibcall optimize vararg calls unless all arguments are passed via
2887  // registers.
2888  if (isVarArg && !Outs.empty()) {
2889
2890    // Optimizing for varargs on Win64 is unlikely to be safe without
2891    // additional testing.
2892    if (IsCalleeWin64 || IsCallerWin64)
2893      return false;
2894
2895    SmallVector<CCValAssign, 16> ArgLocs;
2896    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2897                   getTargetMachine(), ArgLocs, *DAG.getContext());
2898
2899    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2900    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
2901      if (!ArgLocs[i].isRegLoc())
2902        return false;
2903  }
2904
2905  // If the call result is in ST0 / ST1, it needs to be popped off the x87
2906  // stack.  Therefore, if it's not used by the call it is not safe to optimize
2907  // this into a sibcall.
2908  bool Unused = false;
2909  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
2910    if (!Ins[i].Used) {
2911      Unused = true;
2912      break;
2913    }
2914  }
2915  if (Unused) {
2916    SmallVector<CCValAssign, 16> RVLocs;
2917    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
2918                   getTargetMachine(), RVLocs, *DAG.getContext());
2919    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2920    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2921      CCValAssign &VA = RVLocs[i];
2922      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
2923        return false;
2924    }
2925  }
2926
2927  // If the calling conventions do not match, then we'd better make sure the
2928  // results are returned in the same way as what the caller expects.
2929  if (!CCMatch) {
2930    SmallVector<CCValAssign, 16> RVLocs1;
2931    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
2932                    getTargetMachine(), RVLocs1, *DAG.getContext());
2933    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
2934
2935    SmallVector<CCValAssign, 16> RVLocs2;
2936    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
2937                    getTargetMachine(), RVLocs2, *DAG.getContext());
2938    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
2939
2940    if (RVLocs1.size() != RVLocs2.size())
2941      return false;
2942    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2943      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2944        return false;
2945      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2946        return false;
2947      if (RVLocs1[i].isRegLoc()) {
2948        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2949          return false;
2950      } else {
2951        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2952          return false;
2953      }
2954    }
2955  }
2956
2957  // If the callee takes no arguments then go on to check the results of the
2958  // call.
2959  if (!Outs.empty()) {
2960    // Check if stack adjustment is needed. For now, do not do this if any
2961    // argument is passed on the stack.
2962    SmallVector<CCValAssign, 16> ArgLocs;
2963    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2964                   getTargetMachine(), ArgLocs, *DAG.getContext());
2965
2966    // Allocate shadow area for Win64
2967    if (IsCalleeWin64)
2968      CCInfo.AllocateStack(32, 8);
2969
2970    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2971    if (CCInfo.getNextStackOffset()) {
2972      MachineFunction &MF = DAG.getMachineFunction();
2973      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2974        return false;
2975
2976      // Check if the arguments are already laid out in the right way as
2977      // the caller's fixed stack objects.
2978      MachineFrameInfo *MFI = MF.getFrameInfo();
2979      const MachineRegisterInfo *MRI = &MF.getRegInfo();
2980      const X86InstrInfo *TII =
2981        ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
2982      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2983        CCValAssign &VA = ArgLocs[i];
2984        SDValue Arg = OutVals[i];
2985        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2986        if (VA.getLocInfo() == CCValAssign::Indirect)
2987          return false;
2988        if (!VA.isRegLoc()) {
2989          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2990                                   MFI, MRI, TII))
2991            return false;
2992        }
2993      }
2994    }
2995
2996    // If the tailcall address may be in a register, then make sure it's
2997    // possible to register allocate for it. In 32-bit, the call address can
2998    // only target EAX, EDX, or ECX since the tail call must be scheduled after
2999    // callee-saved registers are restored. These happen to be the same
3000    // registers used to pass 'inreg' arguments so watch out for those.
3001    if (!Subtarget->is64Bit() &&
3002        ((!isa<GlobalAddressSDNode>(Callee) &&
3003          !isa<ExternalSymbolSDNode>(Callee)) ||
3004         getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
3005      unsigned NumInRegs = 0;
3006      // In PIC we need an extra register to formulate the address computation
3007      // for the callee.
3008      unsigned MaxInRegs =
3009          (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3010
3011      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3012        CCValAssign &VA = ArgLocs[i];
3013        if (!VA.isRegLoc())
3014          continue;
3015        unsigned Reg = VA.getLocReg();
3016        switch (Reg) {
3017        default: break;
3018        case X86::EAX: case X86::EDX: case X86::ECX:
3019          if (++NumInRegs == MaxInRegs)
3020            return false;
3021          break;
3022        }
3023      }
3024    }
3025  }
3026
3027  return true;
3028}
3029
3030FastISel *
3031X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3032                                  const TargetLibraryInfo *libInfo) const {
3033  return X86::createFastISel(funcInfo, libInfo);
3034}
3035
3036//===----------------------------------------------------------------------===//
3037//                           Other Lowering Hooks
3038//===----------------------------------------------------------------------===//
3039
3040static bool MayFoldLoad(SDValue Op) {
3041  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3042}
3043
3044static bool MayFoldIntoStore(SDValue Op) {
3045  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3046}
3047
3048static bool isTargetShuffle(unsigned Opcode) {
3049  switch(Opcode) {
3050  default: return false;
3051  case X86ISD::PSHUFD:
3052  case X86ISD::PSHUFHW:
3053  case X86ISD::PSHUFLW:
3054  case X86ISD::SHUFP:
3055  case X86ISD::PALIGNR:
3056  case X86ISD::MOVLHPS:
3057  case X86ISD::MOVLHPD:
3058  case X86ISD::MOVHLPS:
3059  case X86ISD::MOVLPS:
3060  case X86ISD::MOVLPD:
3061  case X86ISD::MOVSHDUP:
3062  case X86ISD::MOVSLDUP:
3063  case X86ISD::MOVDDUP:
3064  case X86ISD::MOVSS:
3065  case X86ISD::MOVSD:
3066  case X86ISD::UNPCKL:
3067  case X86ISD::UNPCKH:
3068  case X86ISD::VPERMILP:
3069  case X86ISD::VPERM2X128:
3070  case X86ISD::VPERMI:
3071    return true;
3072  }
3073}
3074
3075static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3076                                    SDValue V1, SelectionDAG &DAG) {
3077  switch(Opc) {
3078  default: llvm_unreachable("Unknown x86 shuffle node");
3079  case X86ISD::MOVSHDUP:
3080  case X86ISD::MOVSLDUP:
3081  case X86ISD::MOVDDUP:
3082    return DAG.getNode(Opc, dl, VT, V1);
3083  }
3084}
3085
3086static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3087                                    SDValue V1, unsigned TargetMask,
3088                                    SelectionDAG &DAG) {
3089  switch(Opc) {
3090  default: llvm_unreachable("Unknown x86 shuffle node");
3091  case X86ISD::PSHUFD:
3092  case X86ISD::PSHUFHW:
3093  case X86ISD::PSHUFLW:
3094  case X86ISD::VPERMILP:
3095  case X86ISD::VPERMI:
3096    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3097  }
3098}
3099
3100static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3101                                    SDValue V1, SDValue V2, unsigned TargetMask,
3102                                    SelectionDAG &DAG) {
3103  switch(Opc) {
3104  default: llvm_unreachable("Unknown x86 shuffle node");
3105  case X86ISD::PALIGNR:
3106  case X86ISD::SHUFP:
3107  case X86ISD::VPERM2X128:
3108    return DAG.getNode(Opc, dl, VT, V1, V2,
3109                       DAG.getConstant(TargetMask, MVT::i8));
3110  }
3111}
3112
3113static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3114                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
3115  switch(Opc) {
3116  default: llvm_unreachable("Unknown x86 shuffle node");
3117  case X86ISD::MOVLHPS:
3118  case X86ISD::MOVLHPD:
3119  case X86ISD::MOVHLPS:
3120  case X86ISD::MOVLPS:
3121  case X86ISD::MOVLPD:
3122  case X86ISD::MOVSS:
3123  case X86ISD::MOVSD:
3124  case X86ISD::UNPCKL:
3125  case X86ISD::UNPCKH:
3126    return DAG.getNode(Opc, dl, VT, V1, V2);
3127  }
3128}
3129
3130SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3131  MachineFunction &MF = DAG.getMachineFunction();
3132  const X86RegisterInfo *RegInfo =
3133    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
3134  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3135  int ReturnAddrIndex = FuncInfo->getRAIndex();
3136
3137  if (ReturnAddrIndex == 0) {
3138    // Set up a frame object for the return address.
3139    unsigned SlotSize = RegInfo->getSlotSize();
3140    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
3141                                                           false);
3142    FuncInfo->setRAIndex(ReturnAddrIndex);
3143  }
3144
3145  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3146}
3147
3148bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3149                                       bool hasSymbolicDisplacement) {
3150  // Offset should fit into 32 bit immediate field.
3151  if (!isInt<32>(Offset))
3152    return false;
3153
3154  // If we don't have a symbolic displacement - we don't have any extra
3155  // restrictions.
3156  if (!hasSymbolicDisplacement)
3157    return true;
3158
3159  // FIXME: Some tweaks might be needed for medium code model.
3160  if (M != CodeModel::Small && M != CodeModel::Kernel)
3161    return false;
3162
3163  // For the small code model we assume that the last object lies within 16MB
3164  // of the 31-bit boundary. We may also accept pretty large negative constants,
3165  // knowing that all objects are in the positive half of the address space.
3166  if (M == CodeModel::Small && Offset < 16*1024*1024)
3167    return true;
3168
3169  // For the kernel code model we know that all objects reside in the negative
3170  // half of the 32-bit address space. We must not accept negative offsets, since
3171  // they may fall just outside an object, but we may accept pretty large positive ones.
3172  if (M == CodeModel::Kernel && Offset > 0)
3173    return true;
3174
3175  return false;
3176}
3177
3178/// isCalleePop - Determines whether the callee is required to pop its
3179/// own arguments. Callee pop is necessary to support tail calls.
3180bool X86::isCalleePop(CallingConv::ID CallingConv,
3181                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3182  if (IsVarArg)
3183    return false;
3184
3185  switch (CallingConv) {
3186  default:
3187    return false;
3188  case CallingConv::X86_StdCall:
3189    return !is64Bit;
3190  case CallingConv::X86_FastCall:
3191    return !is64Bit;
3192  case CallingConv::X86_ThisCall:
3193    return !is64Bit;
3194  case CallingConv::Fast:
3195    return TailCallOpt;
3196  case CallingConv::GHC:
3197    return TailCallOpt;
3198  case CallingConv::HiPE:
3199    return TailCallOpt;
3200  }
3201}
3202
3203/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
3204/// specific condition code, returning the condition code and the LHS/RHS of the
3205/// comparison to make.
3206static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3207                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3208  if (!isFP) {
3209    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3210      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3211        // X > -1   -> X == 0, jump !sign.
3212        RHS = DAG.getConstant(0, RHS.getValueType());
3213        return X86::COND_NS;
3214      }
3215      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3216        // X < 0   -> X == 0, jump on sign.
3217        return X86::COND_S;
3218      }
3219      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3220        // X < 1   -> X <= 0
3221        RHS = DAG.getConstant(0, RHS.getValueType());
3222        return X86::COND_LE;
3223      }
3224    }
3225
3226    switch (SetCCOpcode) {
3227    default: llvm_unreachable("Invalid integer condition!");
3228    case ISD::SETEQ:  return X86::COND_E;
3229    case ISD::SETGT:  return X86::COND_G;
3230    case ISD::SETGE:  return X86::COND_GE;
3231    case ISD::SETLT:  return X86::COND_L;
3232    case ISD::SETLE:  return X86::COND_LE;
3233    case ISD::SETNE:  return X86::COND_NE;
3234    case ISD::SETULT: return X86::COND_B;
3235    case ISD::SETUGT: return X86::COND_A;
3236    case ISD::SETULE: return X86::COND_BE;
3237    case ISD::SETUGE: return X86::COND_AE;
3238    }
3239  }
3240
3241  // First determine if it is required or is profitable to flip the operands.
3242
3243  // If LHS is a foldable load, but RHS is not, flip the condition.
3244  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3245      !ISD::isNON_EXTLoad(RHS.getNode())) {
3246    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3247    std::swap(LHS, RHS);
3248  }
3249
3250  switch (SetCCOpcode) {
3251  default: break;
3252  case ISD::SETOLT:
3253  case ISD::SETOLE:
3254  case ISD::SETUGT:
3255  case ISD::SETUGE:
3256    std::swap(LHS, RHS);
3257    break;
3258  }
3259
3260  // On a floating point condition, the flags are set as follows:
3261  // ZF  PF  CF   op
3262  //  0 | 0 | 0 | X > Y
3263  //  0 | 0 | 1 | X < Y
3264  //  1 | 0 | 0 | X == Y
3265  //  1 | 1 | 1 | unordered
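  //  For example, SETOGT lowers to COND_A (JA), which tests CF == 0 and ZF == 0,
  //  i.e. the X > Y row above.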
3266  switch (SetCCOpcode) {
3267  default: llvm_unreachable("Condcode should be pre-legalized away");
3268  case ISD::SETUEQ:
3269  case ISD::SETEQ:   return X86::COND_E;
3270  case ISD::SETOLT:              // flipped
3271  case ISD::SETOGT:
3272  case ISD::SETGT:   return X86::COND_A;
3273  case ISD::SETOLE:              // flipped
3274  case ISD::SETOGE:
3275  case ISD::SETGE:   return X86::COND_AE;
3276  case ISD::SETUGT:              // flipped
3277  case ISD::SETULT:
3278  case ISD::SETLT:   return X86::COND_B;
3279  case ISD::SETUGE:              // flipped
3280  case ISD::SETULE:
3281  case ISD::SETLE:   return X86::COND_BE;
3282  case ISD::SETONE:
3283  case ISD::SETNE:   return X86::COND_NE;
3284  case ISD::SETUO:   return X86::COND_P;
3285  case ISD::SETO:    return X86::COND_NP;
3286  case ISD::SETOEQ:
3287  case ISD::SETUNE:  return X86::COND_INVALID;
3288  }
3289}
3290
3291/// hasFPCMov - Is there a floating-point cmov for the specific X86 condition
3292/// code? The current x86 ISA includes the following FP cmov instructions:
3293/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3294static bool hasFPCMov(unsigned X86CC) {
3295  switch (X86CC) {
3296  default:
3297    return false;
3298  case X86::COND_B:
3299  case X86::COND_BE:
3300  case X86::COND_E:
3301  case X86::COND_P:
3302  case X86::COND_A:
3303  case X86::COND_AE:
3304  case X86::COND_NE:
3305  case X86::COND_NP:
3306    return true;
3307  }
3308}
3309
3310/// isFPImmLegal - Returns true if the target can instruction select the
3311/// specified FP immediate natively. If false, the legalizer will
3312/// materialize the FP immediate as a load from a constant pool.
3313bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3314  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3315    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3316      return true;
3317  }
3318  return false;
3319}
3320
3321/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3322/// the specified half-open range [Low, Hi).
3323static bool isUndefOrInRange(int Val, int Low, int Hi) {
3324  return (Val < 0) || (Val >= Low && Val < Hi);
3325}
3326
3327/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3328/// specified value.
3329static bool isUndefOrEqual(int Val, int CmpVal) {
3330  return (Val < 0 || Val == CmpVal);
3331}
3332
3333/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3334/// at position Pos and ending at Pos+Size, is undef or falls within the
3335/// specified sequential range [Low, Low+Size).
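/// For example, Mask = <4, -1, 6, 7> with Pos = 0, Size = 4 and Low = 4 is accepted.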
3336static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3337                                       unsigned Pos, unsigned Size, int Low) {
3338  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3339    if (!isUndefOrEqual(Mask[i], Low))
3340      return false;
3341  return true;
3342}
3343
3344/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3345/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
3346/// the second operand.
3347static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
3348  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
3349    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
3350  if (VT == MVT::v2f64 || VT == MVT::v2i64)
3351    return (Mask[0] < 2 && Mask[1] < 2);
3352  return false;
3353}
3354
3355/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3356/// is suitable for input to PSHUFHW.
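/// For example, the v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4> is accepted.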
3357static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
3358  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3359    return false;
3360
3361  // Lower quadword copied in order or undef.
3362  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3363    return false;
3364
3365  // Upper quadword shuffled.
3366  for (unsigned i = 4; i != 8; ++i)
3367    if (!isUndefOrInRange(Mask[i], 4, 8))
3368      return false;
3369
3370  if (VT == MVT::v16i16) {
3371    // Lower quadword copied in order or undef.
3372    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3373      return false;
3374
3375    // Upper quadword shuffled.
3376    for (unsigned i = 12; i != 16; ++i)
3377      if (!isUndefOrInRange(Mask[i], 12, 16))
3378        return false;
3379  }
3380
3381  return true;
3382}
3383
3384/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3385/// is suitable for input to PSHUFLW.
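/// For example, the v8i16 mask <3, 2, 1, 0, 4, 5, 6, 7> is accepted.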
3386static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
3387  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3388    return false;
3389
3390  // Upper quadword copied in order.
3391  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3392    return false;
3393
3394  // Lower quadword shuffled.
3395  for (unsigned i = 0; i != 4; ++i)
3396    if (!isUndefOrInRange(Mask[i], 0, 4))
3397      return false;
3398
3399  if (VT == MVT::v16i16) {
3400    // Upper quadword copied in order.
3401    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
3402      return false;
3403
3404    // Lower quadword shuffled.
3405    for (unsigned i = 8; i != 12; ++i)
3406      if (!isUndefOrInRange(Mask[i], 8, 12))
3407        return false;
3408  }
3409
3410  return true;
3411}
3412
3413/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
3414/// is suitable for input to PALIGNR.
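/// For example, the v4i32 mask <1, 2, 3, 4> is accepted: elements 1..3 of the
/// first source followed by element 0 of the second.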
3415static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
3416                          const X86Subtarget *Subtarget) {
3417  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
3418      (VT.is256BitVector() && !Subtarget->hasInt256()))
3419    return false;
3420
3421  unsigned NumElts = VT.getVectorNumElements();
3422  unsigned NumLanes = VT.getSizeInBits()/128;
3423  unsigned NumLaneElts = NumElts/NumLanes;
3424
3425  // Do not handle 64-bit element shuffles with palignr.
3426  if (NumLaneElts == 2)
3427    return false;
3428
3429  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
3430    unsigned i;
3431    for (i = 0; i != NumLaneElts; ++i) {
3432      if (Mask[i+l] >= 0)
3433        break;
3434    }
3435
3436    // Lane is all undef, go to next lane
3437    if (i == NumLaneElts)
3438      continue;
3439
3440    int Start = Mask[i+l];
3441
3442    // Make sure it's in this lane in one of the sources
3443    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
3444        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
3445      return false;
3446
3447    // If not lane 0, then we must match lane 0
3448    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
3449      return false;
3450
3451    // Correct second source to be contiguous with first source
3452    if (Start >= (int)NumElts)
3453      Start -= NumElts - NumLaneElts;
3454
3455    // Make sure we're shifting in the right direction.
3456    if (Start <= (int)(i+l))
3457      return false;
3458
3459    Start -= i;
3460
3461    // Check the rest of the elements to see if they are consecutive.
3462    for (++i; i != NumLaneElts; ++i) {
3463      int Idx = Mask[i+l];
3464
3465      // Make sure it's in this lane
3466      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
3467          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
3468        return false;
3469
3470      // If not lane 0, then we must match lane 0
3471      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
3472        return false;
3473
3474      if (Idx >= (int)NumElts)
3475        Idx -= NumElts - NumLaneElts;
3476
3477      if (!isUndefOrEqual(Idx, Start+i))
3478        return false;
3479
3480    }
3481  }
3482
3483  return true;
3484}
3485
3486/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3487/// the two vector operands have swapped position.
3488static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
3489                                     unsigned NumElems) {
3490  for (unsigned i = 0; i != NumElems; ++i) {
3491    int idx = Mask[i];
3492    if (idx < 0)
3493      continue;
3494    else if (idx < (int)NumElems)
3495      Mask[i] = idx + NumElems;
3496    else
3497      Mask[i] = idx - NumElems;
3498  }
3499}
3500
3501/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3502/// specifies a shuffle of elements that is suitable for input to 128/256-bit
3503/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
3504/// reverse of what x86 shuffles want.
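/// For example, the v4f32 mask <0, 1, 4, 5> is accepted when Commuted is false.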
3505static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
3506                        bool Commuted = false) {
3507  if (!HasFp256 && VT.is256BitVector())
3508    return false;
3509
3510  unsigned NumElems = VT.getVectorNumElements();
3511  unsigned NumLanes = VT.getSizeInBits()/128;
3512  unsigned NumLaneElems = NumElems/NumLanes;
3513
3514  if (NumLaneElems != 2 && NumLaneElems != 4)
3515    return false;
3516
3517  // VSHUFPSY divides the resulting vector into 4 chunks.
3518  // The sources are also split into 4 chunks, and each destination
3519  // chunk must come from a different source chunk.
3520  //
3521  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
3522  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
3523  //
3524  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
3525  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
3526  //
3527  // VSHUFPDY divides the resulting vector into 4 chunks.
3528  // The sources are also split into 4 chunks, and each destination
3529  // chunk must come from a different source chunk.
3530  //
3531  //  SRC1 =>      X3       X2       X1       X0
3532  //  SRC2 =>      Y3       Y2       Y1       Y0
3533  //
3534  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
3535  //
3536  unsigned HalfLaneElems = NumLaneElems/2;
3537  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
3538    for (unsigned i = 0; i != NumLaneElems; ++i) {
3539      int Idx = Mask[i+l];
3540      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
3541      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
3542        return false;
3543      // For VSHUFPSY, the mask of the second half must be the same as the
3544      // first but with the appropriate offsets. This works in the same way as
3545      // VPERMILPS works with masks.
3546      if (NumElems != 8 || l == 0 || Mask[i] < 0)
3547        continue;
3548      if (!isUndefOrEqual(Idx, Mask[i]+l))
3549        return false;
3550    }
3551  }
3552
3553  return true;
3554}
3555
3556/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3557/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3558static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
3559  if (!VT.is128BitVector())
3560    return false;
3561
3562  unsigned NumElems = VT.getVectorNumElements();
3563
3564  if (NumElems != 4)
3565    return false;
3566
3567  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
3568  return isUndefOrEqual(Mask[0], 6) &&
3569         isUndefOrEqual(Mask[1], 7) &&
3570         isUndefOrEqual(Mask[2], 2) &&
3571         isUndefOrEqual(Mask[3], 3);
3572}
3573
3574/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3575/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3576/// <2, 3, 2, 3>
3577static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
3578  if (!VT.is128BitVector())
3579    return false;
3580
3581  unsigned NumElems = VT.getVectorNumElements();
3582
3583  if (NumElems != 4)
3584    return false;
3585
3586  return isUndefOrEqual(Mask[0], 2) &&
3587         isUndefOrEqual(Mask[1], 3) &&
3588         isUndefOrEqual(Mask[2], 2) &&
3589         isUndefOrEqual(Mask[3], 3);
3590}
3591
3592/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3593/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
3594static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
3595  if (!VT.is128BitVector())
3596    return false;
3597
3598  unsigned NumElems = VT.getVectorNumElements();
3599
3600  if (NumElems != 2 && NumElems != 4)
3601    return false;
3602
3603  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3604    if (!isUndefOrEqual(Mask[i], i + NumElems))
3605      return false;
3606
3607  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
3608    if (!isUndefOrEqual(Mask[i], i))
3609      return false;
3610
3611  return true;
3612}
3613
3614/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3615/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
3616static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
3617  if (!VT.is128BitVector())
3618    return false;
3619
3620  unsigned NumElems = VT.getVectorNumElements();
3621
3622  if (NumElems != 2 && NumElems != 4)
3623    return false;
3624
3625  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3626    if (!isUndefOrEqual(Mask[i], i))
3627      return false;
3628
3629  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3630    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
3631      return false;
3632
3633  return true;
3634}
3635
3636//
3637// Some special combinations that can be optimized.
3638//
3639static
3640SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
3641                               SelectionDAG &DAG) {
3642  MVT VT = SVOp->getValueType(0).getSimpleVT();
3643  SDLoc dl(SVOp);
3644
3645  if (VT != MVT::v8i32 && VT != MVT::v8f32)
3646    return SDValue();
3647
3648  ArrayRef<int> Mask = SVOp->getMask();
3649
3650  // These are the special masks that may be optimized.
3651  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
3652  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
3653  bool MatchEvenMask = true;
3654  bool MatchOddMask  = true;
3655  for (int i=0; i<8; ++i) {
3656    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
3657      MatchEvenMask = false;
3658    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
3659      MatchOddMask = false;
3660  }
3661
3662  if (!MatchEvenMask && !MatchOddMask)
3663    return SDValue();
3664
3665  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
3666
3667  SDValue Op0 = SVOp->getOperand(0);
3668  SDValue Op1 = SVOp->getOperand(1);
3669
3670  if (MatchEvenMask) {
3671    // Shift the second operand right to 32 bits.
3672    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
3673    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
3674  } else {
3675    // Shift the first operand left to 32 bits.
3676    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
3677    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
3678  }
3679  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
3680  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
3681}
3682
3683/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
3684/// specifies a shuffle of elements that is suitable for input to UNPCKL.
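/// For example, the v4i32 mask <0, 4, 1, 5> is accepted (interleave the low
/// halves of the two sources).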
3685static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
3686                         bool HasInt256, bool V2IsSplat = false) {
3687  unsigned NumElts = VT.getVectorNumElements();
3688
3689  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3690         "Unsupported vector type for unpckl");
3691
3692  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
3693      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3694    return false;
3695
3696  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3697  // independently on 128-bit lanes.
3698  unsigned NumLanes = VT.getSizeInBits()/128;
3699  unsigned NumLaneElts = NumElts/NumLanes;
3700
3701  for (unsigned l = 0; l != NumLanes; ++l) {
3702    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3703         i != (l+1)*NumLaneElts;
3704         i += 2, ++j) {
3705      int BitI  = Mask[i];
3706      int BitI1 = Mask[i+1];
3707      if (!isUndefOrEqual(BitI, j))
3708        return false;
3709      if (V2IsSplat) {
3710        if (!isUndefOrEqual(BitI1, NumElts))
3711          return false;
3712      } else {
3713        if (!isUndefOrEqual(BitI1, j + NumElts))
3714          return false;
3715      }
3716    }
3717  }
3718
3719  return true;
3720}
3721
3722/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
3723/// specifies a shuffle of elements that is suitable for input to UNPCKH.
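/// For example, the v4i32 mask <2, 6, 3, 7> is accepted (interleave the high
/// halves of the two sources).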
3724static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
3725                         bool HasInt256, bool V2IsSplat = false) {
3726  unsigned NumElts = VT.getVectorNumElements();
3727
3728  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3729         "Unsupported vector type for unpckh");
3730
3731  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
3732      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3733    return false;
3734
3735  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3736  // independently on 128-bit lanes.
3737  unsigned NumLanes = VT.getSizeInBits()/128;
3738  unsigned NumLaneElts = NumElts/NumLanes;
3739
3740  for (unsigned l = 0; l != NumLanes; ++l) {
3741    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3742         i != (l+1)*NumLaneElts; i += 2, ++j) {
3743      int BitI  = Mask[i];
3744      int BitI1 = Mask[i+1];
3745      if (!isUndefOrEqual(BitI, j))
3746        return false;
3747      if (V2IsSplat) {
3748        if (isUndefOrEqual(BitI1, NumElts))
3749          return false;
3750      } else {
3751        if (!isUndefOrEqual(BitI1, j+NumElts))
3752          return false;
3753      }
3754    }
3755  }
3756  return true;
3757}
3758
3759/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
3760/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
3761/// <0, 0, 1, 1>
3762static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
3763  unsigned NumElts = VT.getVectorNumElements();
3764  bool Is256BitVec = VT.is256BitVector();
3765
3766  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3767         "Unsupported vector type for unpckl");
3768
3769  if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
3770      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3771    return false;
3772
3773  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
3774  // FIXME: Need a better way to get rid of this, there's no latency difference
3775  // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
3776  // the former later. We should also remove the "_undef" special mask.
3777  if (NumElts == 4 && Is256BitVec)
3778    return false;
3779
3780  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3781  // independently on 128-bit lanes.
3782  unsigned NumLanes = VT.getSizeInBits()/128;
3783  unsigned NumLaneElts = NumElts/NumLanes;
3784
3785  for (unsigned l = 0; l != NumLanes; ++l) {
3786    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3787         i != (l+1)*NumLaneElts;
3788         i += 2, ++j) {
3789      int BitI  = Mask[i];
3790      int BitI1 = Mask[i+1];
3791
3792      if (!isUndefOrEqual(BitI, j))
3793        return false;
3794      if (!isUndefOrEqual(BitI1, j))
3795        return false;
3796    }
3797  }
3798
3799  return true;
3800}
3801
3802/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
3803/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
3804/// <2, 2, 3, 3>
3805static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
3806  unsigned NumElts = VT.getVectorNumElements();
3807
3808  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3809         "Unsupported vector type for unpckh");
3810
3811  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
3812      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3813    return false;
3814
3815  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3816  // independently on 128-bit lanes.
3817  unsigned NumLanes = VT.getSizeInBits()/128;
3818  unsigned NumLaneElts = NumElts/NumLanes;
3819
3820  for (unsigned l = 0; l != NumLanes; ++l) {
3821    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3822         i != (l+1)*NumLaneElts; i += 2, ++j) {
3823      int BitI  = Mask[i];
3824      int BitI1 = Mask[i+1];
3825      if (!isUndefOrEqual(BitI, j))
3826        return false;
3827      if (!isUndefOrEqual(BitI1, j))
3828        return false;
3829    }
3830  }
3831  return true;
3832}
3833
3834/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3835/// specifies a shuffle of elements that is suitable for input to MOVSS,
3836/// MOVSD, and MOVD, i.e. setting the lowest element.
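/// For example, on v4f32 the only accepted mask (allowing undefs) is
/// <4, 1, 2, 3>: the low element comes from V2 and the rest from V1.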
3837static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
3838  if (VT.getVectorElementType().getSizeInBits() < 32)
3839    return false;
3840  if (!VT.is128BitVector())
3841    return false;
3842
3843  unsigned NumElts = VT.getVectorNumElements();
3844
3845  if (!isUndefOrEqual(Mask[0], NumElts))
3846    return false;
3847
3848  for (unsigned i = 1; i != NumElts; ++i)
3849    if (!isUndefOrEqual(Mask[i], i))
3850      return false;
3851
3852  return true;
3853}
3854
3855/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
3856/// as permutations between 128-bit chunks or halves. As an example: this
3857/// shuffle below:
3858///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
3859/// The first half comes from the second half of V1 and the second half from
3860/// the second half of V2.
3861static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
3862  if (!HasFp256 || !VT.is256BitVector())
3863    return false;
3864
3865  // The shuffle result is divided into half A and half B. In total the two
3866  // sources have 4 halves, namely: C, D, E, F. The final values of A and
3867  // B must come from C, D, E or F.
3868  unsigned HalfSize = VT.getVectorNumElements()/2;
3869  bool MatchA = false, MatchB = false;
3870
3871  // Check if A comes from one of C, D, E, F.
3872  for (unsigned Half = 0; Half != 4; ++Half) {
3873    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
3874      MatchA = true;
3875      break;
3876    }
3877  }
3878
3879  // Check if B comes from one of C, D, E, F.
3880  for (unsigned Half = 0; Half != 4; ++Half) {
3881    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
3882      MatchB = true;
3883      break;
3884    }
3885  }
3886
3887  return MatchA && MatchB;
3888}
3889
3890/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
3891/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
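/// For example, on v8i32 the mask <4, 5, 6, 7, 12, 13, 14, 15> takes the high
/// half of V1 for the low half of the result and the high half of V2 for the
/// high half of the result, which encodes as the immediate 0x31.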
3892static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
3893  MVT VT = SVOp->getValueType(0).getSimpleVT();
3894
3895  unsigned HalfSize = VT.getVectorNumElements()/2;
3896
3897  unsigned FstHalf = 0, SndHalf = 0;
3898  for (unsigned i = 0; i < HalfSize; ++i) {
3899    if (SVOp->getMaskElt(i) > 0) {
3900      FstHalf = SVOp->getMaskElt(i)/HalfSize;
3901      break;
3902    }
3903  }
3904  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
3905    if (SVOp->getMaskElt(i) > 0) {
3906      SndHalf = SVOp->getMaskElt(i)/HalfSize;
3907      break;
3908    }
3909  }
3910
3911  return (FstHalf | (SndHalf << 4));
3912}
3913
3914/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
3915/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
3916/// Note that VPERMIL mask matching is different depending on whether the
3917/// underlying element type is 32 or 64 bits. For VPERMILPS the high half of the
3918/// mask should select the same elements as the low half, but from the higher
3919/// half of the source. For VPERMILPD the two lanes can be shuffled independently
3920/// of each other (lanes still can't be crossed). Also handles PSHUFDY.
3921static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
3922  if (!HasFp256)
3923    return false;
3924
3925  unsigned NumElts = VT.getVectorNumElements();
3926  // Only match 256-bit with 32/64-bit types
3927  if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
3928    return false;
3929
3930  unsigned NumLanes = VT.getSizeInBits()/128;
3931  unsigned LaneSize = NumElts/NumLanes;
3932  for (unsigned l = 0; l != NumElts; l += LaneSize) {
3933    for (unsigned i = 0; i != LaneSize; ++i) {
3934      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
3935        return false;
3936      if (NumElts != 8 || l == 0)
3937        continue;
3938      // VPERMILPS handling
3939      if (Mask[i] < 0)
3940        continue;
3941      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
3942        return false;
3943    }
3944  }
3945
3946  return true;
3947}
3948
3949/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of
3950/// what x86 movss wants. x86 movss requires the lowest element to be the
3951/// lowest element of vector 2 and the others to come from vector 1 in order.
3952static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
3953                               bool V2IsSplat = false, bool V2IsUndef = false) {
3954  if (!VT.is128BitVector())
3955    return false;
3956
3957  unsigned NumOps = VT.getVectorNumElements();
3958  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3959    return false;
3960
3961  if (!isUndefOrEqual(Mask[0], 0))
3962    return false;
3963
3964  for (unsigned i = 1; i != NumOps; ++i)
3965    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3966          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3967          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3968      return false;
3969
3970  return true;
3971}
3972
3973/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3974/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3975/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
3976static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
3977                           const X86Subtarget *Subtarget) {
3978  if (!Subtarget->hasSSE3())
3979    return false;
3980
3981  unsigned NumElems = VT.getVectorNumElements();
3982
3983  if ((VT.is128BitVector() && NumElems != 4) ||
3984      (VT.is256BitVector() && NumElems != 8))
3985    return false;
3986
3987  // "i+1" is the value the indexed mask element must have
3988  for (unsigned i = 0; i != NumElems; i += 2)
3989    if (!isUndefOrEqual(Mask[i], i+1) ||
3990        !isUndefOrEqual(Mask[i+1], i+1))
3991      return false;
3992
3993  return true;
3994}
3995
3996/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3997/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3998/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
3999static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
4000                           const X86Subtarget *Subtarget) {
4001  if (!Subtarget->hasSSE3())
4002    return false;
4003
4004  unsigned NumElems = VT.getVectorNumElements();
4005
4006  if ((VT.is128BitVector() && NumElems != 4) ||
4007      (VT.is256BitVector() && NumElems != 8))
4008    return false;
4009
4010  // "i" is the value the indexed mask element must have
4011  for (unsigned i = 0; i != NumElems; i += 2)
4012    if (!isUndefOrEqual(Mask[i], i) ||
4013        !isUndefOrEqual(Mask[i+1], i))
4014      return false;
4015
4016  return true;
4017}
4018
4019/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4020/// specifies a shuffle of elements that is suitable for input to the 256-bit
4021/// version of MOVDDUP.
4022static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
4023  if (!HasFp256 || !VT.is256BitVector())
4024    return false;
4025
4026  unsigned NumElts = VT.getVectorNumElements();
4027  if (NumElts != 4)
4028    return false;
4029
4030  for (unsigned i = 0; i != NumElts/2; ++i)
4031    if (!isUndefOrEqual(Mask[i], 0))
4032      return false;
4033  for (unsigned i = NumElts/2; i != NumElts; ++i)
4034    if (!isUndefOrEqual(Mask[i], NumElts/2))
4035      return false;
4036  return true;
4037}
4038
4039/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4040/// specifies a shuffle of elements that is suitable for input to the 128-bit
4041/// version of MOVDDUP.
4042static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
4043  if (!VT.is128BitVector())
4044    return false;
4045
4046  unsigned e = VT.getVectorNumElements() / 2;
4047  for (unsigned i = 0; i != e; ++i)
4048    if (!isUndefOrEqual(Mask[i], i))
4049      return false;
4050  for (unsigned i = 0; i != e; ++i)
4051    if (!isUndefOrEqual(Mask[e+i], i))
4052      return false;
4053  return true;
4054}
4055
4056/// isVEXTRACTF128Index - Return true if the specified
4057/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4058/// suitable for input to VEXTRACTF128.
4059bool X86::isVEXTRACTF128Index(SDNode *N) {
4060  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4061    return false;
4062
4063  // The index should be aligned on a 128-bit boundary.
4064  uint64_t Index =
4065    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4066
4067  MVT VT = N->getValueType(0).getSimpleVT();
4068  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4069  bool Result = (Index * ElSize) % 128 == 0;
4070
4071  return Result;
4072}
4073
4074/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
4075/// operand specifies a subvector insert that is suitable for input to
4076/// VINSERTF128.
4077bool X86::isVINSERTF128Index(SDNode *N) {
4078  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4079    return false;
4080
4081  // The index should be aligned on a 128-bit boundary.
4082  uint64_t Index =
4083    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4084
4085  MVT VT = N->getValueType(0).getSimpleVT();
4086  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4087  bool Result = (Index * ElSize) % 128 == 0;
4088
4089  return Result;
4090}
4091
4092/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4093/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4094/// Handles 128-bit and 256-bit.
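/// For example, the v4f32 mask <3, 2, 1, 0> encodes as 0x1B (0b00011011), two
/// bits per element with element 0's selector in the least significant bits.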
4095static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4096  MVT VT = N->getValueType(0).getSimpleVT();
4097
4098  assert((VT.is128BitVector() || VT.is256BitVector()) &&
4099         "Unsupported vector type for PSHUF/SHUFP");
4100
4101  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4102  // independently on 128-bit lanes.
4103  unsigned NumElts = VT.getVectorNumElements();
4104  unsigned NumLanes = VT.getSizeInBits()/128;
4105  unsigned NumLaneElts = NumElts/NumLanes;
4106
4107  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
4108         "Only supports 2 or 4 elements per lane");
4109
4110  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
4111  unsigned Mask = 0;
4112  for (unsigned i = 0; i != NumElts; ++i) {
4113    int Elt = N->getMaskElt(i);
4114    if (Elt < 0) continue;
4115    Elt &= NumLaneElts - 1;
4116    unsigned ShAmt = (i << Shift) % 8;
4117    Mask |= Elt << ShAmt;
4118  }
4119
4120  return Mask;
4121}
4122
4123/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4124/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
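/// For example, the v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4>, which reverses the
/// high four words, encodes as 0x1B.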
4125static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4126  MVT VT = N->getValueType(0).getSimpleVT();
4127
4128  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4129         "Unsupported vector type for PSHUFHW");
4130
4131  unsigned NumElts = VT.getVectorNumElements();
4132
4133  unsigned Mask = 0;
4134  for (unsigned l = 0; l != NumElts; l += 8) {
4135    // 8 nodes per lane, but we only care about the last 4.
4136    for (unsigned i = 0; i < 4; ++i) {
4137      int Elt = N->getMaskElt(l+i+4);
4138      if (Elt < 0) continue;
4139      Elt &= 0x3; // only 2-bits.
4140      Mask |= Elt << (i * 2);
4141    }
4142  }
4143
4144  return Mask;
4145}
4146
4147/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4148/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
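/// For example, the v8i16 mask <3, 2, 1, 0, 4, 5, 6, 7>, which reverses the
/// low four words, encodes as 0x1B.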
4149static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4150  MVT VT = N->getValueType(0).getSimpleVT();
4151
4152  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4153         "Unsupported vector type for PSHUFLW");
4154
4155  unsigned NumElts = VT.getVectorNumElements();
4156
4157  unsigned Mask = 0;
4158  for (unsigned l = 0; l != NumElts; l += 8) {
4159    // 8 nodes per lane, but we only care about the first 4.
4160    for (unsigned i = 0; i < 4; ++i) {
4161      int Elt = N->getMaskElt(l+i);
4162      if (Elt < 0) continue;
4163      Elt &= 0x3; // only 2-bits
4164      Mask |= Elt << (i * 2);
4165    }
4166  }
4167
4168  return Mask;
4169}
4170
4171/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
4172/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
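/// For example, a v16i8 mask selecting bytes <5, 6, ..., 20> of the
/// concatenated sources yields the byte-shift immediate 5.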
4173static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4174  MVT VT = SVOp->getValueType(0).getSimpleVT();
4175  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
4176
4177  unsigned NumElts = VT.getVectorNumElements();
4178  unsigned NumLanes = VT.getSizeInBits()/128;
4179  unsigned NumLaneElts = NumElts/NumLanes;
4180
4181  int Val = 0;
4182  unsigned i;
4183  for (i = 0; i != NumElts; ++i) {
4184    Val = SVOp->getMaskElt(i);
4185    if (Val >= 0)
4186      break;
4187  }
4188  if (Val >= (int)NumElts)
4189    Val -= NumElts - NumLaneElts;
4190
4191  assert(Val - i > 0 && "PALIGNR imm should be positive");
4192  return (Val - i) * EltSize;
4193}
4194
4195/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
4196/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4197/// instructions.
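/// For example, extracting the v4i32 subvector starting at element 4 of a
/// v8i32 gives 4 / (128/32) = 1, i.e. the upper 128-bit half.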
4198unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
4199  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4200    llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
4201
4202  uint64_t Index =
4203    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4204
4205  MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
4206  MVT ElVT = VecVT.getVectorElementType();
4207
4208  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
4209  return Index / NumElemsPerChunk;
4210}
4211
4212/// getInsertVINSERTF128Immediate - Return the appropriate immediate
4213/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
4214/// instructions.
4215unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
4216  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4217    llvm_unreachable("Illegal insert subvector for VINSERTF128");
4218
4219  uint64_t Index =
4220    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4221
4222  MVT VecVT = N->getValueType(0).getSimpleVT();
4223  MVT ElVT = VecVT.getVectorElementType();
4224
4225  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
4226  return Index / NumElemsPerChunk;
4227}
4228
4229/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
4230/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
4231/// Handles 256-bit.
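/// For example, the v4i64 mask <3, 0, 1, 2> encodes as 0x93
/// (3 | 0<<2 | 1<<4 | 2<<6).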
4232static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
4233  MVT VT = N->getValueType(0).getSimpleVT();
4234
4235  unsigned NumElts = VT.getVectorNumElements();
4236
4237  assert((VT.is256BitVector() && NumElts == 4) &&
4238         "Unsupported vector type for VPERMQ/VPERMPD");
4239
4240  unsigned Mask = 0;
4241  for (unsigned i = 0; i != NumElts; ++i) {
4242    int Elt = N->getMaskElt(i);
4243    if (Elt < 0)
4244      continue;
4245    Mask |= Elt << (i*2);
4246  }
4247
4248  return Mask;
4249}
4250/// isZeroNode - Returns true if Elt is a constant zero or a floating point
4251/// constant +0.0.
4252bool X86::isZeroNode(SDValue Elt) {
4253  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
4254    return CN->isNullValue();
4255  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
4256    return CFP->getValueAPF().isPosZero();
4257  return false;
4258}
4259
4260/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
4261/// their permute mask.
4262static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
4263                                    SelectionDAG &DAG) {
4264  MVT VT = SVOp->getValueType(0).getSimpleVT();
4265  unsigned NumElems = VT.getVectorNumElements();
4266  SmallVector<int, 8> MaskVec;
4267
4268  for (unsigned i = 0; i != NumElems; ++i) {
4269    int Idx = SVOp->getMaskElt(i);
4270    if (Idx >= 0) {
4271      if (Idx < (int)NumElems)
4272        Idx += NumElems;
4273      else
4274        Idx -= NumElems;
4275    }
4276    MaskVec.push_back(Idx);
4277  }
4278  return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
4279                              SVOp->getOperand(0), &MaskVec[0]);
4280}
4281
4282/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
4283/// match movhlps. The lower half elements should come from upper half of
4284/// V1 (and in order), and the upper half elements should come from the upper
4285/// half of V2 (and in order).
4286static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
4287  if (!VT.is128BitVector())
4288    return false;
4289  if (VT.getVectorNumElements() != 4)
4290    return false;
4291  for (unsigned i = 0, e = 2; i != e; ++i)
4292    if (!isUndefOrEqual(Mask[i], i+2))
4293      return false;
4294  for (unsigned i = 2; i != 4; ++i)
4295    if (!isUndefOrEqual(Mask[i], i+4))
4296      return false;
4297  return true;
4298}
4299
4300/// isScalarLoadToVector - Returns true if the node is a scalar load that
4301/// is promoted to a vector. It also returns the LoadSDNode by reference if
4302/// required.
4303static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
4304  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
4305    return false;
4306  N = N->getOperand(0).getNode();
4307  if (!ISD::isNON_EXTLoad(N))
4308    return false;
4309  if (LD)
4310    *LD = cast<LoadSDNode>(N);
4311  return true;
4312}
4313
4314// Test whether the given value is a vector value which will be legalized
4315// into a load.
4316static bool WillBeConstantPoolLoad(SDNode *N) {
4317  if (N->getOpcode() != ISD::BUILD_VECTOR)
4318    return false;
4319
4320  // Check for any non-constant elements.
4321  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
4322    switch (N->getOperand(i).getNode()->getOpcode()) {
4323    case ISD::UNDEF:
4324    case ISD::ConstantFP:
4325    case ISD::Constant:
4326      break;
4327    default:
4328      return false;
4329    }
4330
4331  // Vectors of all-zeros and all-ones are materialized with special
4332  // instructions rather than being loaded.
4333  return !ISD::isBuildVectorAllZeros(N) &&
4334         !ISD::isBuildVectorAllOnes(N);
4335}
4336
4337/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
4338/// match movlp{s|d}. The lower half elements should come from lower half of
4339/// V1 (and in order), and the upper half elements should come from the upper
4340/// half of V2 (and in order). And since V1 will become the source of the
4341/// MOVLP, it must be either a vector load or a scalar load to vector.
4342static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
4343                               ArrayRef<int> Mask, EVT VT) {
4344  if (!VT.is128BitVector())
4345    return false;
4346
4347  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
4348    return false;
4349  // If V2 is a vector load, don't do this transformation. We will try to use
4350  // a load-folding shufps op instead.
4351  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
4352    return false;
4353
4354  unsigned NumElems = VT.getVectorNumElements();
4355
4356  if (NumElems != 2 && NumElems != 4)
4357    return false;
4358  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4359    if (!isUndefOrEqual(Mask[i], i))
4360      return false;
4361  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4362    if (!isUndefOrEqual(Mask[i], i+NumElems))
4363      return false;
4364  return true;
4365}
4366
4367/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
4368/// all the same.
4369static bool isSplatVector(SDNode *N) {
4370  if (N->getOpcode() != ISD::BUILD_VECTOR)
4371    return false;
4372
4373  SDValue SplatValue = N->getOperand(0);
4374  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
4375    if (N->getOperand(i) != SplatValue)
4376      return false;
4377  return true;
4378}
4379
4380/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
4381/// to a zero vector.
4382/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
4383static bool isZeroShuffle(ShuffleVectorSDNode *N) {
4384  SDValue V1 = N->getOperand(0);
4385  SDValue V2 = N->getOperand(1);
4386  unsigned NumElems = N->getValueType(0).getVectorNumElements();
4387  for (unsigned i = 0; i != NumElems; ++i) {
4388    int Idx = N->getMaskElt(i);
4389    if (Idx >= (int)NumElems) {
4390      unsigned Opc = V2.getOpcode();
4391      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
4392        continue;
4393      if (Opc != ISD::BUILD_VECTOR ||
4394          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
4395        return false;
4396    } else if (Idx >= 0) {
4397      unsigned Opc = V1.getOpcode();
4398      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
4399        continue;
4400      if (Opc != ISD::BUILD_VECTOR ||
4401          !X86::isZeroNode(V1.getOperand(Idx)))
4402        return false;
4403    }
4404  }
4405  return true;
4406}
4407
4408/// getZeroVector - Returns a vector of specified type with all zero elements.
4409///
4410static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
4411                             SelectionDAG &DAG, SDLoc dl) {
4412  assert(VT.isVector() && "Expected a vector type");
4413
4414  // Always build SSE zero vectors as <4 x i32> bitcasted
4415  // to their dest type. This ensures they get CSE'd.
4416  SDValue Vec;
4417  if (VT.is128BitVector()) {  // SSE
4418    if (Subtarget->hasSSE2()) {  // SSE2
4419      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4420      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4421    } else { // SSE1
4422      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4423      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
4424    }
4425  } else if (VT.is256BitVector()) { // AVX
4426    if (Subtarget->hasInt256()) { // AVX2
4427      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4428      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4429      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
4430                        array_lengthof(Ops));
4431    } else {
4432      // 256-bit logic and arithmetic instructions in AVX are all
4433      // floating-point, no support for integer ops. Emit fp zeroed vectors.
4434      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4435      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4436      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
4437                        array_lengthof(Ops));
4438    }
4439  } else
4440    llvm_unreachable("Unexpected vector type");
4441
4442  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4443}
4444
4445/// getOnesVector - Returns a vector of specified type with all bits set.
4446/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4447/// no AVX2 support, use two <4 x i32> inserted in an <8 x i32> appropriately.
4448/// Then bitcast to their original type, ensuring they get CSE'd.
4449static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
4450                             SDLoc dl) {
4451  assert(VT.isVector() && "Expected a vector type");
4452
4453  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
4454  SDValue Vec;
4455  if (VT.is256BitVector()) {
4456    if (HasInt256) { // AVX2
4457      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4458      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
4459                        array_lengthof(Ops));
4460    } else { // AVX
4461      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4462      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4463    }
4464  } else if (VT.is128BitVector()) {
4465    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4466  } else
4467    llvm_unreachable("Unexpected vector type");
4468
4469  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4470}
4471
4472/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
4473/// that point to V2 point to its first element.
4474static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
4475  for (unsigned i = 0; i != NumElems; ++i) {
4476    if (Mask[i] > (int)NumElems) {
4477      Mask[i] = NumElems;
4478    }
4479  }
4480}
4481
4482/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
4483/// operation of specified width.
4484static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
4485                       SDValue V2) {
4486  unsigned NumElems = VT.getVectorNumElements();
4487  SmallVector<int, 8> Mask;
4488  Mask.push_back(NumElems);
4489  for (unsigned i = 1; i != NumElems; ++i)
4490    Mask.push_back(i);
4491  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4492}
4493
4494/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
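/// For a 4-element type the mask is <0, 4, 1, 5>.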
4495static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
4496                          SDValue V2) {
4497  unsigned NumElems = VT.getVectorNumElements();
4498  SmallVector<int, 8> Mask;
4499  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4500    Mask.push_back(i);
4501    Mask.push_back(i + NumElems);
4502  }
4503  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4504}
4505
4506/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
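/// For a 4-element type the mask is <2, 6, 3, 7>.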
4507static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
4508                          SDValue V2) {
4509  unsigned NumElems = VT.getVectorNumElements();
4510  SmallVector<int, 8> Mask;
4511  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4512    Mask.push_back(i + Half);
4513    Mask.push_back(i + NumElems + Half);
4514  }
4515  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4516}
4517
4518// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
4519// a generic shuffle instruction because the target has no such instructions.
4520// Generate shuffles which repeat i16 and i8 several times until they can be
4521// represented by v4f32 and then be manipulated by target supported shuffles.
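// For example, to splat element 5 of a v8i16, one unpackh of V with itself
// yields <4, 4, 5, 5, 6, 6, 7, 7> and the splat index becomes 1, so the later
// v4f32 splat of element 1 replicates the original element 5 into every lane.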
4522static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
4523  EVT VT = V.getValueType();
4524  int NumElems = VT.getVectorNumElements();
4525  SDLoc dl(V);
4526
4527  while (NumElems > 4) {
4528    if (EltNo < NumElems/2) {
4529      V = getUnpackl(DAG, dl, VT, V, V);
4530    } else {
4531      V = getUnpackh(DAG, dl, VT, V, V);
4532      EltNo -= NumElems/2;
4533    }
4534    NumElems >>= 1;
4535  }
4536  return V;
4537}
4538
4539/// getLegalSplat - Generate a legal splat with supported x86 shuffles
4540static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
4541  EVT VT = V.getValueType();
4542  SDLoc dl(V);
4543
4544  if (VT.is128BitVector()) {
4545    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
4546    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
4547    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
4548                             &SplatMask[0]);
4549  } else if (VT.is256BitVector()) {
4550    // To use VPERMILPS to splat scalars, the second half of indices must
4551    // refer to the higher part, which is a duplication of the lower one,
4552    // because VPERMILPS can only handle in-lane permutations.
4553    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
4554                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
4555
4556    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
4557    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
4558                             &SplatMask[0]);
4559  } else
4560    llvm_unreachable("Vector size not supported");
4561
4562  return DAG.getNode(ISD::BITCAST, dl, VT, V);
4563}
4564
4565/// PromoteSplat - Splat is promoted to target supported vector shuffles.
4566static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
4567  EVT SrcVT = SV->getValueType(0);
4568  SDValue V1 = SV->getOperand(0);
4569  SDLoc dl(SV);
4570
4571  int EltNo = SV->getSplatIndex();
4572  int NumElems = SrcVT.getVectorNumElements();
4573  bool Is256BitVec = SrcVT.is256BitVector();
4574
4575  assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
4576         "Unknown how to promote splat for type");
4577
4578  // Extract the 128-bit part containing the splat element and update
4579  // the splat element index when it refers to the higher register.
4580  if (Is256BitVec) {
4581    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
4582    if (EltNo >= NumElems/2)
4583      EltNo -= NumElems/2;
4584  }
4585
4586  // All i16 and i8 vector types can't be used directly by a generic shuffle
4587  // instruction because the target has no such instruction. Generate shuffles
4588  // which repeat i16 and i8 several times until they fit in i32, and then can
4589  // be manipulated by target supported shuffles.
4590  EVT EltVT = SrcVT.getVectorElementType();
4591  if (EltVT == MVT::i8 || EltVT == MVT::i16)
4592    V1 = PromoteSplati8i16(V1, DAG, EltNo);
4593
4594  // Recreate the 256-bit vector and place the same 128-bit vector
4595  // into the low and high part. This is necessary because we want
4596  // to use VPERM* to shuffle the vectors
4597  if (Is256BitVec) {
4598    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
4599  }
4600
4601  return getLegalSplat(DAG, V1, EltNo);
4602}
4603
4604/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
4605/// vector with a zero or undef vector.  This produces a shuffle where the low
4606/// element of V2 is swizzled into the zero/undef vector, landing at element
4607/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
4608static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
4609                                           bool IsZero,
4610                                           const X86Subtarget *Subtarget,
4611                                           SelectionDAG &DAG) {
4612  EVT VT = V2.getValueType();
4613  SDValue V1 = IsZero
4614    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4615  unsigned NumElems = VT.getVectorNumElements();
4616  SmallVector<int, 16> MaskVec;
4617  for (unsigned i = 0; i != NumElems; ++i)
4618    // If this is the insertion idx, put the low elt of V2 here.
4619    MaskVec.push_back(i == Idx ? NumElems : i);
4620  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
4621}
4622
4623/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
4624/// target specific opcode. Returns true if the Mask could be calculated.
4625/// Sets IsUnary to true if only uses one source.
4626static bool getTargetShuffleMask(SDNode *N, MVT VT,
4627                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
4628  unsigned NumElems = VT.getVectorNumElements();
4629  SDValue ImmN;
4630
4631  IsUnary = false;
4632  switch(N->getOpcode()) {
4633  case X86ISD::SHUFP:
4634    ImmN = N->getOperand(N->getNumOperands()-1);
4635    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4636    break;
4637  case X86ISD::UNPCKH:
4638    DecodeUNPCKHMask(VT, Mask);
4639    break;
4640  case X86ISD::UNPCKL:
4641    DecodeUNPCKLMask(VT, Mask);
4642    break;
4643  case X86ISD::MOVHLPS:
4644    DecodeMOVHLPSMask(NumElems, Mask);
4645    break;
4646  case X86ISD::MOVLHPS:
4647    DecodeMOVLHPSMask(NumElems, Mask);
4648    break;
4649  case X86ISD::PALIGNR:
4650    ImmN = N->getOperand(N->getNumOperands()-1);
4651    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4652    break;
4653  case X86ISD::PSHUFD:
4654  case X86ISD::VPERMILP:
4655    ImmN = N->getOperand(N->getNumOperands()-1);
4656    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4657    IsUnary = true;
4658    break;
4659  case X86ISD::PSHUFHW:
4660    ImmN = N->getOperand(N->getNumOperands()-1);
4661    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4662    IsUnary = true;
4663    break;
4664  case X86ISD::PSHUFLW:
4665    ImmN = N->getOperand(N->getNumOperands()-1);
4666    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4667    IsUnary = true;
4668    break;
4669  case X86ISD::VPERMI:
4670    ImmN = N->getOperand(N->getNumOperands()-1);
4671    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4672    IsUnary = true;
4673    break;
4674  case X86ISD::MOVSS:
4675  case X86ISD::MOVSD: {
4676    // The index 0 always comes from the first element of the second source,
4677    // this is why MOVSS and MOVSD are used in the first place. The other
4678    // elements come from the other positions of the first source vector
4679    Mask.push_back(NumElems);
4680    for (unsigned i = 1; i != NumElems; ++i) {
4681      Mask.push_back(i);
4682    }
4683    break;
4684  }
4685  case X86ISD::VPERM2X128:
4686    ImmN = N->getOperand(N->getNumOperands()-1);
4687    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4688    if (Mask.empty()) return false;
4689    break;
4690  case X86ISD::MOVDDUP:
4691  case X86ISD::MOVLHPD:
4692  case X86ISD::MOVLPD:
4693  case X86ISD::MOVLPS:
4694  case X86ISD::MOVSHDUP:
4695  case X86ISD::MOVSLDUP:
4696    // Not yet implemented
4697    return false;
4698  default: llvm_unreachable("unknown target shuffle node");
4699  }
4700
4701  return true;
4702}
4703
4704/// getShuffleScalarElt - Returns the scalar element that will make up the ith
4705/// element of the result of the vector shuffle.
4706static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
4707                                   unsigned Depth) {
4708  if (Depth == 6)
4709    return SDValue();  // Limit search depth.
4710
4711  SDValue V = SDValue(N, 0);
4712  EVT VT = V.getValueType();
4713  unsigned Opcode = V.getOpcode();
4714
4715  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
4716  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
4717    int Elt = SV->getMaskElt(Index);
4718
4719    if (Elt < 0)
4720      return DAG.getUNDEF(VT.getVectorElementType());
4721
4722    unsigned NumElems = VT.getVectorNumElements();
4723    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
4724                                         : SV->getOperand(1);
4725    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
4726  }
4727
4728  // Recurse into target specific vector shuffles to find scalars.
4729  if (isTargetShuffle(Opcode)) {
4730    MVT ShufVT = V.getValueType().getSimpleVT();
4731    unsigned NumElems = ShufVT.getVectorNumElements();
4732    SmallVector<int, 16> ShuffleMask;
4733    bool IsUnary;
4734
4735    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
4736      return SDValue();
4737
4738    int Elt = ShuffleMask[Index];
4739    if (Elt < 0)
4740      return DAG.getUNDEF(ShufVT.getVectorElementType());
4741
4742    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
4743                                         : N->getOperand(1);
4744    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
4745                               Depth+1);
4746  }
4747
4748  // Actual nodes that may contain scalar elements
4749  if (Opcode == ISD::BITCAST) {
4750    V = V.getOperand(0);
4751    EVT SrcVT = V.getValueType();
4752    unsigned NumElems = VT.getVectorNumElements();
4753
4754    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
4755      return SDValue();
4756  }
4757
4758  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
4759    return (Index == 0) ? V.getOperand(0)
4760                        : DAG.getUNDEF(VT.getVectorElementType());
4761
4762  if (V.getOpcode() == ISD::BUILD_VECTOR)
4763    return V.getOperand(Index);
4764
4765  return SDValue();
4766}
4767
4768/// getNumOfConsecutiveZeros - Return the number of elements of a vector
4769/// shuffle operation which consecutively come from zero. The
4770/// search can start in two different directions, from left or right.
4771/// We count undefs as zeros until PreferredNum is reached.
4772static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
4773                                         unsigned NumElems, bool ZerosFromLeft,
4774                                         SelectionDAG &DAG,
4775                                         unsigned PreferredNum = -1U) {
4776  unsigned NumZeros = 0;
4777  for (unsigned i = 0; i != NumElems; ++i) {
4778    unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
4779    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
4780    if (!Elt.getNode())
4781      break;
4782
4783    if (X86::isZeroNode(Elt))
4784      ++NumZeros;
4785    else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
4786      NumZeros = std::min(NumZeros + 1, PreferredNum);
4787    else
4788      break;
4789  }
4790
4791  return NumZeros;
4792}
4793
4794/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
4795/// correspond consecutively to elements from one of the vector operands,
4796/// starting from its index OpIdx. Also sets OpNum to the source vector operand.
4797static
4798bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
4799                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
4800                              unsigned NumElems, unsigned &OpNum) {
4801  bool SeenV1 = false;
4802  bool SeenV2 = false;
4803
4804  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
4805    int Idx = SVOp->getMaskElt(i);
4806    // Ignore undef indices
4807    if (Idx < 0)
4808      continue;
4809
4810    if (Idx < (int)NumElems)
4811      SeenV1 = true;
4812    else
4813      SeenV2 = true;
4814
4815    // Only accept consecutive elements from the same vector
4816    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
4817      return false;
4818  }
4819
4820  OpNum = SeenV1 ? 0 : 1;
4821  return true;
4822}
4823
4824/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
4825/// logical right shift of a vector.
4826static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4827                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4828  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4829  unsigned NumZeros = getNumOfConsecutiveZeros(
4830      SVOp, NumElems, false /* check zeros from right */, DAG,
4831      SVOp->getMaskElt(0));
4832  unsigned OpSrc;
4833
4834  if (!NumZeros)
4835    return false;
4836
4837  // Considering the elements in the mask that are not consecutive zeros,
4838  // check if they consecutively come from only one of the source vectors.
4839  //
4840  //               V1 = {X, A, B, C}     0
4841  //                         \  \  \    /
4842  //   vector_shuffle V1, V2 <1, 2, 3, X>
4843  //
4844  if (!isShuffleMaskConsecutive(SVOp,
4845            0,                   // Mask Start Index
4846            NumElems-NumZeros,   // Mask End Index(exclusive)
4847            NumZeros,            // Where to start looking in the src vector
4848            NumElems,            // Number of elements in vector
4849            OpSrc))              // Which source operand ?
4850    return false;
4851
4852  isLeft = false;
4853  ShAmt = NumZeros;
4854  ShVal = SVOp->getOperand(OpSrc);
4855  return true;
4856}
4857
4858/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
4859/// logical left shift of a vector.
4860static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4861                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4862  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4863  unsigned NumZeros = getNumOfConsecutiveZeros(
4864      SVOp, NumElems, true /* check zeros from left */, DAG,
4865      NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
4866  unsigned OpSrc;
4867
4868  if (!NumZeros)
4869    return false;
4870
4871  // Considering the elements in the mask that are not consecutive zeros,
4872  // check if they consecutively come from only one of the source vectors.
4873  //
4874  //                           0    { A, B, X, X } = V2
4875  //                          / \    /  /
4876  //   vector_shuffle V1, V2 <X, X, 4, 5>
4877  //
4878  if (!isShuffleMaskConsecutive(SVOp,
4879            NumZeros,     // Mask Start Index
4880            NumElems,     // Mask End Index(exclusive)
4881            0,            // Where to start looking in the src vector
4882            NumElems,     // Number of elements in vector
4883            OpSrc))       // Which source operand ?
4884    return false;
4885
4886  isLeft = true;
4887  ShAmt = NumZeros;
4888  ShVal = SVOp->getOperand(OpSrc);
4889  return true;
4890}
4891
4892/// isVectorShift - Returns true if the shuffle can be implemented as a
4893/// logical left or right shift of a vector.
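/// For example, shuffling V1 with an all-zero V2 using the v4i32 mask
/// <4, 0, 1, 2> produces <0, V1[0], V1[1], V1[2]>, which is matched as a
/// logical left shift by one element.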
4894static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4895                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4896  // Although the logic below support any bitwidth size, there are no
4897  // shift instructions which handle more than 128-bit vectors.
4898  if (!SVOp->getValueType(0).is128BitVector())
4899    return false;
4900
4901  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
4902      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
4903    return true;
4904
4905  return false;
4906}
4907
4908/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
4909///
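/// Pairs of i8 elements are zero-extended into i16 values (the odd element
/// shifted left by 8 and OR'd with the even one), inserted into a v8i16, and
/// the result is bitcast back to v16i8.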
4910static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
4911                                       unsigned NumNonZero, unsigned NumZero,
4912                                       SelectionDAG &DAG,
4913                                       const X86Subtarget* Subtarget,
4914                                       const TargetLowering &TLI) {
4915  if (NumNonZero > 8)
4916    return SDValue();
4917
4918  SDLoc dl(Op);
4919  SDValue V(0, 0);
4920  bool First = true;
4921  for (unsigned i = 0; i < 16; ++i) {
4922    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
4923    if (ThisIsNonZero && First) {
4924      if (NumZero)
4925        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4926      else
4927        V = DAG.getUNDEF(MVT::v8i16);
4928      First = false;
4929    }
4930
4931    if ((i & 1) != 0) {
4932      SDValue ThisElt(0, 0), LastElt(0, 0);
4933      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
4934      if (LastIsNonZero) {
4935        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
4936                              MVT::i16, Op.getOperand(i-1));
4937      }
4938      if (ThisIsNonZero) {
4939        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
4940        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
4941                              ThisElt, DAG.getConstant(8, MVT::i8));
4942        if (LastIsNonZero)
4943          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
4944      } else
4945        ThisElt = LastElt;
4946
4947      if (ThisElt.getNode())
4948        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
4949                        DAG.getIntPtrConstant(i/2));
4950    }
4951  }
4952
4953  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
4954}
4955
4956/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
4957///
4958static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
4959                                     unsigned NumNonZero, unsigned NumZero,
4960                                     SelectionDAG &DAG,
4961                                     const X86Subtarget* Subtarget,
4962                                     const TargetLowering &TLI) {
4963  if (NumNonZero > 4)
4964    return SDValue();
4965
4966  SDLoc dl(Op);
4967  SDValue V(0, 0);
4968  bool First = true;
4969  for (unsigned i = 0; i < 8; ++i) {
4970    bool isNonZero = (NonZeros & (1 << i)) != 0;
4971    if (isNonZero) {
4972      if (First) {
4973        if (NumZero)
4974          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4975        else
4976          V = DAG.getUNDEF(MVT::v8i16);
4977        First = false;
4978      }
4979      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
4980                      MVT::v8i16, V, Op.getOperand(i),
4981                      DAG.getIntPtrConstant(i));
4982    }
4983  }
4984
4985  return V;
4986}
4987
4988/// getVShift - Return a vector logical shift node.
4989///
4990static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
4991                         unsigned NumBits, SelectionDAG &DAG,
4992                         const TargetLowering &TLI, SDLoc dl) {
4993  assert(VT.is128BitVector() && "Unknown type for VShift");
4994  EVT ShVT = MVT::v2i64;
4995  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
4996  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
4997  return DAG.getNode(ISD::BITCAST, dl, VT,
4998                     DAG.getNode(Opc, dl, ShVT, SrcOp,
4999                             DAG.getConstant(NumBits,
5000                                  TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
5001}
5002
5003SDValue
5004X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
5005                                          SelectionDAG &DAG) const {
5006
5007  // Check if the scalar load can be widened into a vector load. And if
5008  // the address is "base + cst" see if the cst can be "absorbed" into
5009  // the shuffle mask.
5010  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5011    SDValue Ptr = LD->getBasePtr();
5012    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5013      return SDValue();
5014    EVT PVT = LD->getValueType(0);
5015    if (PVT != MVT::i32 && PVT != MVT::f32)
5016      return SDValue();
5017
5018    int FI = -1;
5019    int64_t Offset = 0;
5020    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5021      FI = FINode->getIndex();
5022      Offset = 0;
5023    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5024               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5025      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5026      Offset = Ptr.getConstantOperandVal(1);
5027      Ptr = Ptr.getOperand(0);
5028    } else {
5029      return SDValue();
5030    }
5031
5032    // FIXME: 256-bit vector instructions don't require a strict alignment,
5033    // improve this code to support it better.
5034    unsigned RequiredAlign = VT.getSizeInBits()/8;
5035    SDValue Chain = LD->getChain();
5036    // Make sure the stack object alignment is at least 16 or 32.
5037    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5038    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5039      if (MFI->isFixedObjectIndex(FI)) {
5040        // Can't change the alignment. FIXME: It's possible to compute
5041        // the exact stack offset and reference FI + adjust offset instead.
5042        // If someone *really* cares about this. That's the way to implement it.
5043        return SDValue();
5044      } else {
5045        MFI->setObjectAlignment(FI, RequiredAlign);
5046      }
5047    }
5048
5049    // (Offset % 16 or 32) must be a multiple of 4. The address is then
5050    // Ptr + (Offset & ~(RequiredAlign-1)).
5051    if (Offset < 0)
5052      return SDValue();
5053    if ((Offset % RequiredAlign) & 3)
5054      return SDValue();
5055    int64_t StartOffset = Offset & ~(RequiredAlign-1);
5056    if (StartOffset)
5057      Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5058                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5059
5060    int EltNo = (Offset - StartOffset) >> 2;
5061    unsigned NumElems = VT.getVectorNumElements();
5062
5063    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
5064    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
5065                             LD->getPointerInfo().getWithOffset(StartOffset),
5066                             false, false, false, 0);
5067
5068    SmallVector<int, 8> Mask;
5069    for (unsigned i = 0; i != NumElems; ++i)
5070      Mask.push_back(EltNo);
5071
5072    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
5073  }
5074
5075  return SDValue();
5076}
5077
5078/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
5079/// vector of type 'VT', see if the elements can be replaced by a single large
5080/// load which has the same value as a build_vector whose operands are 'elts'.
5081///
5082/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
5083///
5084/// FIXME: we'd also like to handle the case where the last elements are zero
5085/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
5086/// There's even a handy isZeroNode for that purpose.
5087static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
5088                                        SDLoc &DL, SelectionDAG &DAG) {
5089  EVT EltVT = VT.getVectorElementType();
5090  unsigned NumElems = Elts.size();
5091
5092  LoadSDNode *LDBase = NULL;
5093  unsigned LastLoadedElt = -1U;
5094
5095  // For each element in the initializer, see if we've found a load or an undef.
5096  // If we don't find an initial load element, or later load elements are
5097  // non-consecutive, bail out.
5098  for (unsigned i = 0; i < NumElems; ++i) {
5099    SDValue Elt = Elts[i];
5100
5101    if (!Elt.getNode() ||
5102        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
5103      return SDValue();
5104    if (!LDBase) {
5105      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
5106        return SDValue();
5107      LDBase = cast<LoadSDNode>(Elt.getNode());
5108      LastLoadedElt = i;
5109      continue;
5110    }
5111    if (Elt.getOpcode() == ISD::UNDEF)
5112      continue;
5113
5114    LoadSDNode *LD = cast<LoadSDNode>(Elt);
5115    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
5116      return SDValue();
5117    LastLoadedElt = i;
5118  }
5119
5120  // If we have found an entire vector of loads and undefs, then return a large
5121  // load of the entire vector width starting at the base pointer.  If we found
5122  // consecutive loads for the low half, generate a vzext_load node.
5123  if (LastLoadedElt == NumElems - 1) {
5124    SDValue NewLd = SDValue();
5125    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
5126      NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5127                          LDBase->getPointerInfo(),
5128                          LDBase->isVolatile(), LDBase->isNonTemporal(),
5129                          LDBase->isInvariant(), 0);
5130    NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5131                        LDBase->getPointerInfo(),
5132                        LDBase->isVolatile(), LDBase->isNonTemporal(),
5133                        LDBase->isInvariant(), LDBase->getAlignment());
5134
5135    if (LDBase->hasAnyUseOfValue(1)) {
5136      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5137                                     SDValue(LDBase, 1),
5138                                     SDValue(NewLd.getNode(), 1));
5139      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5140      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5141                             SDValue(NewLd.getNode(), 1));
5142    }
5143
5144    return NewLd;
5145  }
5146  if (NumElems == 4 && LastLoadedElt == 1 &&
5147      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
5148    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
5149    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5150    SDValue ResNode =
5151        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
5152                                array_lengthof(Ops), MVT::i64,
5153                                LDBase->getPointerInfo(),
5154                                LDBase->getAlignment(),
5155                                false/*isVolatile*/, true/*ReadMem*/,
5156                                false/*WriteMem*/);
5157
5158    // Make sure the newly-created LOAD is in the same position as LDBase in
5159    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
5160    // update uses of LDBase's output chain to use the TokenFactor.
5161    if (LDBase->hasAnyUseOfValue(1)) {
5162      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5163                             SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
5164      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5165      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5166                             SDValue(ResNode.getNode(), 1));
5167    }
5168
5169    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
5170  }
5171  return SDValue();
5172}
5173
5174/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
5175/// to generate a splat value for the following cases:
5176/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5177/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5178/// a scalar load, or a constant.
5179/// The VBROADCAST node is returned when a pattern is found,
5180/// or SDValue() otherwise.
5181SDValue
5182X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
5183  if (!Subtarget->hasFp256())
5184    return SDValue();
5185
5186  MVT VT = Op.getValueType().getSimpleVT();
5187  SDLoc dl(Op);
5188
5189  assert((VT.is128BitVector() || VT.is256BitVector()) &&
5190         "Unsupported vector type for broadcast.");
5191
5192  SDValue Ld;
5193  bool ConstSplatVal;
5194
5195  switch (Op.getOpcode()) {
5196    default:
5197      // Unknown pattern found.
5198      return SDValue();
5199
5200    case ISD::BUILD_VECTOR: {
5201      // The BUILD_VECTOR node must be a splat.
5202      if (!isSplatVector(Op.getNode()))
5203        return SDValue();
5204
5205      Ld = Op.getOperand(0);
5206      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5207                     Ld.getOpcode() == ISD::ConstantFP);
5208
5209      // The suspected load node has several users. Make sure that all
5210      // of its users are from the BUILD_VECTOR node.
5211      // Constants may have multiple users.
5212      if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
5213        return SDValue();
5214      break;
5215    }
5216
5217    case ISD::VECTOR_SHUFFLE: {
5218      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5219
5220      // Shuffles must have a splat mask where the first element is
5221      // broadcasted.
5222      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
5223        return SDValue();
5224
5225      SDValue Sc = Op.getOperand(0);
5226      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
5227          Sc.getOpcode() != ISD::BUILD_VECTOR) {
5228
5229        if (!Subtarget->hasInt256())
5230          return SDValue();
5231
5232        // Use the register form of the broadcast instruction available on AVX2.
5233        if (VT.is256BitVector())
5234          Sc = Extract128BitVector(Sc, 0, DAG, dl);
5235        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
5236      }
5237
5238      Ld = Sc.getOperand(0);
5239      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5240                       Ld.getOpcode() == ISD::ConstantFP);
5241
5242      // The scalar_to_vector node and the suspected
5243      // load node must have exactly one user.
5244      // Constants may have multiple users.
5245      if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
5246        return SDValue();
5247      break;
5248    }
5249  }
5250
5251  bool Is256 = VT.is256BitVector();
5252
5253  // Handle broadcasting a single constant scalar from the constant pool
5254  // into a vector. On Sandybridge it is still better to load a constant vector
5255  // from the constant pool and not to broadcast it from a scalar.
5256  if (ConstSplatVal && Subtarget->hasInt256()) {
5257    EVT CVT = Ld.getValueType();
5258    assert(!CVT.isVector() && "Must not broadcast a vector type");
5259    unsigned ScalarSize = CVT.getSizeInBits();
5260
5261    if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
5262      const Constant *C = 0;
5263      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5264        C = CI->getConstantIntValue();
5265      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5266        C = CF->getConstantFPValue();
5267
5268      assert(C && "Invalid constant type");
5269
5270      SDValue CP = DAG.getConstantPool(C, getPointerTy());
5271      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5272      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
5273                       MachinePointerInfo::getConstantPool(),
5274                       false, false, false, Alignment);
5275
5276      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5277    }
5278  }
5279
5280  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
5281  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5282
5283  // Handle AVX2 in-register broadcasts.
5284  if (!IsLoad && Subtarget->hasInt256() &&
5285      (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
5286    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5287
5288  // The scalar source must be a normal load.
5289  if (!IsLoad)
5290    return SDValue();
5291
5292  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
5293    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5294
  // The integer check is needed for the 64-bit into 128-bit case, so that it
  // doesn't match f64, since there is no vbroadcastsd with an xmm destination.
5297  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
5298    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5299      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5300  }
5301
5302  // Unsupported broadcast.
5303  return SDValue();
5304}
5305
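/// buildFromShuffleMostly - Lower a BUILD_VECTOR whose operands are mostly
/// EXTRACT_VECTOR_ELTs taken from at most two source vectors of the same type
/// as a vector shuffle of those sources, followed by INSERT_VECTOR_ELT for
/// the few remaining (non-extract) operands.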
5306SDValue
5307X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
5308  EVT VT = Op.getValueType();
5309
5310  // Skip if insert_vec_elt is not supported.
5311  if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
5312    return SDValue();
5313
5314  SDLoc DL(Op);
5315  unsigned NumElems = Op.getNumOperands();
5316
5317  SDValue VecIn1;
5318  SDValue VecIn2;
5319  SmallVector<unsigned, 4> InsertIndices;
5320  SmallVector<int, 8> Mask(NumElems, -1);
5321
5322  for (unsigned i = 0; i != NumElems; ++i) {
5323    unsigned Opc = Op.getOperand(i).getOpcode();
5324
5325    if (Opc == ISD::UNDEF)
5326      continue;
5327
5328    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than 1 element needs inserting.
5330      if (InsertIndices.size() > 1)
5331        return SDValue();
5332
5333      InsertIndices.push_back(i);
5334      continue;
5335    }
5336
5337    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
5338    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
5339
    // Quit if extracted from a vector of a different type.
5341    if (ExtractedFromVec.getValueType() != VT)
5342      return SDValue();
5343
5344    // Quit if non-constant index.
5345    if (!isa<ConstantSDNode>(ExtIdx))
5346      return SDValue();
5347
5348    if (VecIn1.getNode() == 0)
5349      VecIn1 = ExtractedFromVec;
5350    else if (VecIn1 != ExtractedFromVec) {
5351      if (VecIn2.getNode() == 0)
5352        VecIn2 = ExtractedFromVec;
5353      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors need shuffling.
5355        return SDValue();
5356    }
5357
5358    unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5359
5360    if (ExtractedFromVec == VecIn1)
5361      Mask[i] = Idx;
5362    else if (ExtractedFromVec == VecIn2)
5363      Mask[i] = Idx + NumElems;
5364  }
5365
5366  if (VecIn1.getNode() == 0)
5367    return SDValue();
5368
5369  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5370  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
5371  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
5372    unsigned Idx = InsertIndices[i];
5373    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
5374                     DAG.getIntPtrConstant(Idx));
5375  }
5376
5377  return NV;
5378}
5379
5380SDValue
5381X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5382  SDLoc dl(Op);
5383
5384  MVT VT = Op.getValueType().getSimpleVT();
5385  MVT ExtVT = VT.getVectorElementType();
5386  unsigned NumElems = Op.getNumOperands();
5387
5388  // Vectors containing all zeros can be matched by pxor and xorps later
5389  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5390    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
5391    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
5392    if (VT == MVT::v4i32 || VT == MVT::v8i32)
5393      return Op;
5394
5395    return getZeroVector(VT, Subtarget, DAG, dl);
5396  }
5397
5398  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
5399  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
5400  // vpcmpeqd on 256-bit vectors.
5401  if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
5402    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
5403      return Op;
5404
5405    return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
5406  }
5407
5408  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
5409  if (Broadcast.getNode())
5410    return Broadcast;
5411
5412  unsigned EVTBits = ExtVT.getSizeInBits();
5413
5414  unsigned NumZero  = 0;
5415  unsigned NumNonZero = 0;
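  // NonZeros is a bitmask with bit i set when operand i is neither undef nor
  // a zero node.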
5416  unsigned NonZeros = 0;
5417  bool IsAllConstants = true;
5418  SmallSet<SDValue, 8> Values;
5419  for (unsigned i = 0; i < NumElems; ++i) {
5420    SDValue Elt = Op.getOperand(i);
5421    if (Elt.getOpcode() == ISD::UNDEF)
5422      continue;
5423    Values.insert(Elt);
5424    if (Elt.getOpcode() != ISD::Constant &&
5425        Elt.getOpcode() != ISD::ConstantFP)
5426      IsAllConstants = false;
5427    if (X86::isZeroNode(Elt))
5428      NumZero++;
5429    else {
5430      NonZeros |= (1 << i);
5431      NumNonZero++;
5432    }
5433  }
5434
5435  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
5436  if (NumNonZero == 0)
5437    return DAG.getUNDEF(VT);
5438
5439  // Special case for single non-zero, non-undef, element.
5440  if (NumNonZero == 1) {
5441    unsigned Idx = countTrailingZeros(NonZeros);
5442    SDValue Item = Op.getOperand(Idx);
5443
5444    // If this is an insertion of an i64 value on x86-32, and if the top bits of
5445    // the value are obviously zero, truncate the value to i32 and do the
5446    // insertion that way.  Only do this if the value is non-constant or if the
5447    // value is a constant being inserted into element 0.  It is cheaper to do
5448    // a constant pool load than it is to do a movd + shuffle.
5449    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
5450        (!IsAllConstants || Idx == 0)) {
5451      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
5452        // Handle SSE only.
5453        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
5454        EVT VecVT = MVT::v4i32;
5455        unsigned VecElts = 4;
5456
5457        // Truncate the value (which may itself be a constant) to i32, and
5458        // convert it to a vector with movd (S2V+shuffle to zero extend).
5459        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
5460        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
5461        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5462
5463        // Now we have our 32-bit value zero extended in the low element of
5464        // a vector.  If Idx != 0, swizzle it into place.
5465        if (Idx != 0) {
5466          SmallVector<int, 4> Mask;
5467          Mask.push_back(Idx);
5468          for (unsigned i = 1; i != VecElts; ++i)
5469            Mask.push_back(i);
5470          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
5471                                      &Mask[0]);
5472        }
5473        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5474      }
5475    }
5476
5477    // If we have a constant or non-constant insertion into the low element of
5478    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
5479    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
5480    // depending on what the source datatype is.
5481    if (Idx == 0) {
5482      if (NumZero == 0)
5483        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5484
5485      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
5486          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
5487        if (VT.is256BitVector()) {
5488          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
5489          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
5490                             Item, DAG.getIntPtrConstant(0));
5491        }
5492        assert(VT.is128BitVector() && "Expected an SSE value type!");
5493        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5494        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
5495        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5496      }
5497
5498      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
5499        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
5500        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
5501        if (VT.is256BitVector()) {
5502          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
5503          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
5504        } else {
5505          assert(VT.is128BitVector() && "Expected an SSE value type!");
5506          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5507        }
5508        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5509      }
5510    }
5511
5512    // Is it a vector logical left shift?
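    // e.g. (v2i64 (build_vector 0, x)) can be built by moving x into the low
    // element and shifting the whole vector left by half its width in bits.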
5513    if (NumElems == 2 && Idx == 1 &&
5514        X86::isZeroNode(Op.getOperand(0)) &&
5515        !X86::isZeroNode(Op.getOperand(1))) {
5516      unsigned NumBits = VT.getSizeInBits();
5517      return getVShift(true, VT,
5518                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5519                                   VT, Op.getOperand(1)),
5520                       NumBits/2, DAG, *this, dl);
5521    }
5522
5523    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
5524      return SDValue();
5525
5526    // Otherwise, if this is a vector with i32 or f32 elements, and the element
5527    // is a non-constant being inserted into an element other than the low one,
5528    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
5529    // movd/movss) to move this into the low element, then shuffle it into
5530    // place.
5531    if (EVTBits == 32) {
5532      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5533
5534      // Turn it into a shuffle of zero and zero-extended scalar to vector.
5535      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
5536      SmallVector<int, 8> MaskVec;
5537      for (unsigned i = 0; i != NumElems; ++i)
5538        MaskVec.push_back(i == Idx ? 0 : 1);
5539      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
5540    }
5541  }
5542
5543  // Splat is obviously ok. Let legalizer expand it to a shuffle.
5544  if (Values.size() == 1) {
5545    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // check if it's possible to issue this instead:
      // shuffle (vload ptr), undef, <1, 1, 1, 1>
5550      unsigned Idx = countTrailingZeros(NonZeros);
5551      SDValue Item = Op.getOperand(Idx);
5552      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
5553        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
5554    }
5555    return SDValue();
5556  }
5557
5558  // A vector full of immediates; various special cases are already
5559  // handled, so this is best done with a single constant-pool load.
5560  if (IsAllConstants)
5561    return SDValue();
5562
5563  // For AVX-length vectors, build the individual 128-bit pieces and use
5564  // shuffles to put them in place.
5565  if (VT.is256BitVector()) {
5566    SmallVector<SDValue, 32> V;
5567    for (unsigned i = 0; i != NumElems; ++i)
5568      V.push_back(Op.getOperand(i));
5569
5570    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
5571
5572    // Build both the lower and upper subvector.
5573    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
5574    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
5575                                NumElems/2);
5576
5577    // Recreate the wider vector with the lower and upper part.
5578    return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
5579  }
5580
5581  // Let legalizer expand 2-wide build_vectors.
5582  if (EVTBits == 64) {
5583    if (NumNonZero == 1) {
5584      // One half is zero or undef.
5585      unsigned Idx = countTrailingZeros(NonZeros);
5586      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
5587                                 Op.getOperand(Idx));
5588      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
5589    }
5590    return SDValue();
5591  }
5592
5593  // If element VT is < 32 bits, convert it to inserts into a zero vector.
5594  if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      Subtarget, *this);
5597    if (V.getNode()) return V;
5598  }
5599
5600  if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      Subtarget, *this);
5603    if (V.getNode()) return V;
5604  }
5605
5606  // If element VT is == 32 bits, turn it into a number of shuffles.
5607  SmallVector<SDValue, 8> V(NumElems);
5608  if (NumElems == 4 && NumZero > 0) {
5609    for (unsigned i = 0; i < 4; ++i) {
5610      bool isZero = !(NonZeros & (1 << i));
5611      if (isZero)
5612        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
5613      else
5614        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5615    }
5616
5617    for (unsigned i = 0; i < 2; ++i) {
5618      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
5619        default: break;
5620        case 0:
5621          V[i] = V[i*2];  // Must be a zero vector.
5622          break;
5623        case 1:
5624          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
5625          break;
5626        case 2:
5627          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
5628          break;
5629        case 3:
5630          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
5631          break;
5632      }
5633    }
5634
5635    bool Reverse1 = (NonZeros & 0x3) == 2;
5636    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
5637    int MaskVec[] = {
5638      Reverse1 ? 1 : 0,
5639      Reverse1 ? 0 : 1,
5640      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
5641      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
5642    };
5643    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
5644  }
5645
5646  if (Values.size() > 1 && VT.is128BitVector()) {
5647    // Check for a build vector of consecutive loads.
5648    for (unsigned i = 0; i < NumElems; ++i)
5649      V[i] = Op.getOperand(i);
5650
5651    // Check for elements which are consecutive loads.
5652    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
5653    if (LD.getNode())
5654      return LD;
5655
    // Check for a build vector built mostly from a shuffle plus a few inserts.
5657    SDValue Sh = buildFromShuffleMostly(Op, DAG);
5658    if (Sh.getNode())
5659      return Sh;
5660
    // For SSE 4.1, use insertps to insert each element into its place,
    // starting from the low element.
5662    if (getSubtarget()->hasSSE41()) {
5663      SDValue Result;
5664      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
5665        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
5666      else
5667        Result = DAG.getUNDEF(VT);
5668
5669      for (unsigned i = 1; i < NumElems; ++i) {
5670        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
5671        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
5672                             Op.getOperand(i), DAG.getIntPtrConstant(i));
5673      }
5674      return Result;
5675    }
5676
    // Otherwise, expand into a number of unpckl*; start by extending each of
    // our (non-undef) elements to the full vector width with the element in the
    // bottom slot of the vector (which generates no code for SSE).
5680    for (unsigned i = 0; i < NumElems; ++i) {
5681      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
5682        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5683      else
5684        V[i] = DAG.getUNDEF(VT);
5685    }
5686
5687    // Next, we iteratively mix elements, e.g. for v4f32:
5688    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
5689    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
5690    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
5691    unsigned EltStride = NumElems >> 1;
5692    while (EltStride != 0) {
5693      for (unsigned i = 0; i < EltStride; ++i) {
5694        // If V[i+EltStride] is undef and this is the first round of mixing,
5695        // then it is safe to just drop this shuffle: V[i] is already in the
5696        // right place, the one element (since it's the first round) being
5697        // inserted as undef can be dropped.  This isn't safe for successive
5698        // rounds because they will permute elements within both vectors.
5699        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
5700            EltStride == NumElems/2)
5701          continue;
5702
5703        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
5704      }
5705      EltStride >>= 1;
5706    }
5707    return V[0];
5708  }
5709  return SDValue();
5710}
5711
5712// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
5713// to create 256-bit vectors from two other 128-bit ones.
5714static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5715  SDLoc dl(Op);
5716  MVT ResVT = Op.getValueType().getSimpleVT();
5717
5718  assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
5719
5720  SDValue V1 = Op.getOperand(0);
5721  SDValue V2 = Op.getOperand(1);
5722  unsigned NumElems = ResVT.getVectorNumElements();
5723
5724  return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
5725}
5726
5727static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5728  assert(Op.getNumOperands() == 2);
5729
5730  // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
5731  // from two other 128-bit ones.
5732  return LowerAVXCONCAT_VECTORS(Op, DAG);
5733}
5734
5735// Try to lower a shuffle node into a simple blend instruction.
5736static SDValue
5737LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
5738                           const X86Subtarget *Subtarget, SelectionDAG &DAG) {
5739  SDValue V1 = SVOp->getOperand(0);
5740  SDValue V2 = SVOp->getOperand(1);
5741  SDLoc dl(SVOp);
5742  MVT VT = SVOp->getValueType(0).getSimpleVT();
5743  MVT EltVT = VT.getVectorElementType();
5744  unsigned NumElems = VT.getVectorNumElements();
5745
5746  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
5747    return SDValue();
5748  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
5749    return SDValue();
5750
5751  // Check the mask for BLEND and build the value.
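  // Bit i of MaskValue is set when result element i should be taken from V2
  // and cleared when it should come from V1; for v16i16 the same 8 bits
  // describe both 128-bit lanes.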
5752  unsigned MaskValue = 0;
5753  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
5754  unsigned NumLanes = (NumElems-1)/8 + 1;
5755  unsigned NumElemsInLane = NumElems / NumLanes;
5756
  // Blend for v16i16 should be symmetric for both lanes.
5758  for (unsigned i = 0; i < NumElemsInLane; ++i) {
5759
5760    int SndLaneEltIdx = (NumLanes == 2) ?
5761      SVOp->getMaskElt(i + NumElemsInLane) : -1;
5762    int EltIdx = SVOp->getMaskElt(i);
5763
5764    if ((EltIdx < 0 || EltIdx == (int)i) &&
5765        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
5766      continue;
5767
5768    if (((unsigned)EltIdx == (i + NumElems)) &&
5769        (SndLaneEltIdx < 0 ||
5770         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
5771      MaskValue |= (1<<i);
5772    else
5773      return SDValue();
5774  }
5775
  // Convert i32 vectors to floating point if we do not have AVX2.
  // AVX2 introduced the VPBLENDD instruction for 128 and 256-bit vectors.
5778  MVT BlendVT = VT;
5779  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
5780    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
5781                               NumElems);
    V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
5784  }
5785
5786  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
5787                            DAG.getConstant(MaskValue, MVT::i32));
5788  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
5789}
5790
5791// v8i16 shuffles - Prefer shuffles in the following order:
5792// 1. [all]   pshuflw, pshufhw, optional move
5793// 2. [ssse3] 1 x pshufb
5794// 3. [ssse3] 2 x pshufb + 1 x por
5795// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
5796static SDValue
5797LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
5798                         SelectionDAG &DAG) {
5799  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5800  SDValue V1 = SVOp->getOperand(0);
5801  SDValue V2 = SVOp->getOperand(1);
5802  SDLoc dl(SVOp);
5803  SmallVector<int, 8> MaskVals;
5804
5805  // Determine if more than 1 of the words in each of the low and high quadwords
5806  // of the result come from the same quadword of one of the two inputs.  Undef
5807  // mask values count as coming from any quadword, for better codegen.
5808  unsigned LoQuad[] = { 0, 0, 0, 0 };
5809  unsigned HiQuad[] = { 0, 0, 0, 0 };
5810  std::bitset<4> InputQuads;
5811  for (unsigned i = 0; i < 8; ++i) {
5812    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
5813    int EltIdx = SVOp->getMaskElt(i);
5814    MaskVals.push_back(EltIdx);
5815    if (EltIdx < 0) {
5816      ++Quad[0];
5817      ++Quad[1];
5818      ++Quad[2];
5819      ++Quad[3];
5820      continue;
5821    }
5822    ++Quad[EltIdx / 4];
5823    InputQuads.set(EltIdx / 4);
5824  }
5825
5826  int BestLoQuad = -1;
5827  unsigned MaxQuad = 1;
5828  for (unsigned i = 0; i < 4; ++i) {
5829    if (LoQuad[i] > MaxQuad) {
5830      BestLoQuad = i;
5831      MaxQuad = LoQuad[i];
5832    }
5833  }
5834
5835  int BestHiQuad = -1;
5836  MaxQuad = 1;
5837  for (unsigned i = 0; i < 4; ++i) {
5838    if (HiQuad[i] > MaxQuad) {
5839      BestHiQuad = i;
5840      MaxQuad = HiQuad[i];
5841    }
5842  }
5843
  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
  // of the two input vectors, shuffle them into one input vector so only a
  // single pshufb instruction is necessary. If there are more than 2 input
  // quads, disable the next transformation since it does not help SSSE3.
5848  bool V1Used = InputQuads[0] || InputQuads[1];
5849  bool V2Used = InputQuads[2] || InputQuads[3];
5850  if (Subtarget->hasSSSE3()) {
5851    if (InputQuads.count() == 2 && V1Used && V2Used) {
5852      BestLoQuad = InputQuads[0] ? 0 : 1;
5853      BestHiQuad = InputQuads[2] ? 2 : 3;
5854    }
5855    if (InputQuads.count() > 2) {
5856      BestLoQuad = -1;
5857      BestHiQuad = -1;
5858    }
5859  }
5860
5861  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
5862  // the shuffle mask.  If a quad is scored as -1, that means that it contains
5863  // words from all 4 input quadwords.
5864  SDValue NewV;
5865  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
5866    int MaskV[] = {
5867      BestLoQuad < 0 ? 0 : BestLoQuad,
5868      BestHiQuad < 0 ? 1 : BestHiQuad
5869    };
5870    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
5871                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
5872                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
5873    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
5874
5875    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
5876    // source words for the shuffle, to aid later transformations.
5877    bool AllWordsInNewV = true;
5878    bool InOrder[2] = { true, true };
5879    for (unsigned i = 0; i != 8; ++i) {
5880      int idx = MaskVals[i];
5881      if (idx != (int)i)
5882        InOrder[i/4] = false;
5883      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
5884        continue;
5885      AllWordsInNewV = false;
5886      break;
5887    }
5888
5889    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
5890    if (AllWordsInNewV) {
5891      for (int i = 0; i != 8; ++i) {
5892        int idx = MaskVals[i];
5893        if (idx < 0)
5894          continue;
5895        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
5896        if ((idx != i) && idx < 4)
5897          pshufhw = false;
5898        if ((idx != i) && idx > 3)
5899          pshuflw = false;
5900      }
5901      V1 = NewV;
5902      V2Used = false;
5903      BestLoQuad = 0;
5904      BestHiQuad = 1;
5905    }
5906
5907    // If we've eliminated the use of V2, and the new mask is a pshuflw or
5908    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
5909    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
5910      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
5911      unsigned TargetMask = 0;
5912      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
5913                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
5914      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5915      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
5916                             getShufflePSHUFLWImmediate(SVOp);
5917      V1 = NewV.getOperand(0);
5918      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
5919    }
5920  }
5921
5922  // Promote splats to a larger type which usually leads to more efficient code.
5923  // FIXME: Is this true if pshufb is available?
5924  if (SVOp->isSplat())
5925    return PromoteSplat(SVOp, DAG);
5926
5927  // If we have SSSE3, and all words of the result are from 1 input vector,
5928  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
5929  // is present, fall back to case 4.
5930  if (Subtarget->hasSSSE3()) {
5931    SmallVector<SDValue,16> pshufbMask;
5932
5933    // If we have elements from both input vectors, set the high bit of the
5934    // shuffle mask element to zero out elements that come from V2 in the V1
5935    // mask, and elements that come from V1 in the V2 mask, so that the two
5936    // results can be OR'd together.
5937    bool TwoInputs = V1Used && V2Used;
5938    for (unsigned i = 0; i != 8; ++i) {
5939      int EltIdx = MaskVals[i] * 2;
5940      int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
5941      int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
5942      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
5943      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
5944    }
5945    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
5946    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5947                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5948                                 MVT::v16i8, &pshufbMask[0], 16));
5949    if (!TwoInputs)
5950      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5951
5952    // Calculate the shuffle mask for the second input, shuffle it, and
5953    // OR it with the first shuffled input.
5954    pshufbMask.clear();
5955    for (unsigned i = 0; i != 8; ++i) {
5956      int EltIdx = MaskVals[i] * 2;
5957      int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
5958      int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
5959      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
5960      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
5961    }
5962    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
5963    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
5964                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5965                                 MVT::v16i8, &pshufbMask[0], 16));
5966    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
5967    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5968  }
5969
5970  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
5971  // and update MaskVals with new element order.
5972  std::bitset<8> InOrder;
5973  if (BestLoQuad >= 0) {
5974    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
5975    for (int i = 0; i != 4; ++i) {
5976      int idx = MaskVals[i];
5977      if (idx < 0) {
5978        InOrder.set(i);
5979      } else if ((idx / 4) == BestLoQuad) {
5980        MaskV[i] = idx & 3;
5981        InOrder.set(i);
5982      }
5983    }
5984    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
5985                                &MaskV[0]);
5986
5987    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
5988      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5989      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
5990                                  NewV.getOperand(0),
5991                                  getShufflePSHUFLWImmediate(SVOp), DAG);
5992    }
5993  }
5994
5995  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
5996  // and update MaskVals with the new element order.
5997  if (BestHiQuad >= 0) {
5998    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
5999    for (unsigned i = 4; i != 8; ++i) {
6000      int idx = MaskVals[i];
6001      if (idx < 0) {
6002        InOrder.set(i);
6003      } else if ((idx / 4) == BestHiQuad) {
6004        MaskV[i] = (idx & 3) + 4;
6005        InOrder.set(i);
6006      }
6007    }
6008    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
6009                                &MaskV[0]);
6010
6011    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
6012      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
6013      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
6014                                  NewV.getOperand(0),
6015                                  getShufflePSHUFHWImmediate(SVOp), DAG);
6016    }
6017  }
6018
6019  // In case BestHi & BestLo were both -1, which means each quadword has a word
6020  // from each of the four input quadwords, calculate the InOrder bitvector now
6021  // before falling through to the insert/extract cleanup.
6022  if (BestLoQuad == -1 && BestHiQuad == -1) {
6023    NewV = V1;
6024    for (int i = 0; i != 8; ++i)
6025      if (MaskVals[i] < 0 || MaskVals[i] == i)
6026        InOrder.set(i);
6027  }
6028
6029  // The other elements are put in the right place using pextrw and pinsrw.
6030  for (unsigned i = 0; i != 8; ++i) {
6031    if (InOrder[i])
6032      continue;
6033    int EltIdx = MaskVals[i];
6034    if (EltIdx < 0)
6035      continue;
6036    SDValue ExtOp = (EltIdx < 8) ?
6037      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
6038                  DAG.getIntPtrConstant(EltIdx)) :
6039      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
6040                  DAG.getIntPtrConstant(EltIdx - 8));
6041    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
6042                       DAG.getIntPtrConstant(i));
6043  }
6044  return NewV;
6045}
6046
6047// v16i8 shuffles - Prefer shuffles in the following order:
6048// 1. [ssse3] 1 x pshufb
6049// 2. [ssse3] 2 x pshufb + 1 x por
6050// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
6051static
6052SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
6053                                 SelectionDAG &DAG,
6054                                 const X86TargetLowering &TLI) {
6055  SDValue V1 = SVOp->getOperand(0);
6056  SDValue V2 = SVOp->getOperand(1);
6057  SDLoc dl(SVOp);
6058  ArrayRef<int> MaskVals = SVOp->getMask();
6059
6060  // Promote splats to a larger type which usually leads to more efficient code.
6061  // FIXME: Is this true if pshufb is available?
6062  if (SVOp->isSplat())
6063    return PromoteSplat(SVOp, DAG);
6064
  // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
  // present, fall back to case 3.
6068
6069  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
6070  if (TLI.getSubtarget()->hasSSSE3()) {
6071    SmallVector<SDValue,16> pshufbMask;
6072
6073    // If all result elements are from one input vector, then only translate
6074    // undef mask values to 0x80 (zero out result) in the pshufb mask.
6075    //
6076    // Otherwise, we have elements from both input vectors, and must zero out
6077    // elements that come from V2 in the first mask, and V1 in the second mask
6078    // so that we can OR them together.
6079    for (unsigned i = 0; i != 16; ++i) {
6080      int EltIdx = MaskVals[i];
6081      if (EltIdx < 0 || EltIdx >= 16)
6082        EltIdx = 0x80;
6083      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6084    }
6085    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
6086                     DAG.getNode(ISD::BUILD_VECTOR, dl,
6087                                 MVT::v16i8, &pshufbMask[0], 16));
6088
6089    // As PSHUFB will zero elements with negative indices, it's safe to ignore
6090    // the 2nd operand if it's undefined or zero.
6091    if (V2.getOpcode() == ISD::UNDEF ||
6092        ISD::isBuildVectorAllZeros(V2.getNode()))
6093      return V1;
6094
6095    // Calculate the shuffle mask for the second input, shuffle it, and
6096    // OR it with the first shuffled input.
6097    pshufbMask.clear();
6098    for (unsigned i = 0; i != 16; ++i) {
6099      int EltIdx = MaskVals[i];
6100      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
6101      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6102    }
6103    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
6104                     DAG.getNode(ISD::BUILD_VECTOR, dl,
6105                                 MVT::v16i8, &pshufbMask[0], 16));
6106    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
6107  }
6108
  // No SSSE3 - Calculate in-place words and then fix all out-of-place words
  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
6112  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
6113  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
6114  SDValue NewV = V1;
6115  for (int i = 0; i != 8; ++i) {
6116    int Elt0 = MaskVals[i*2];
6117    int Elt1 = MaskVals[i*2+1];
6118
6119    // This word of the result is all undef, skip it.
6120    if (Elt0 < 0 && Elt1 < 0)
6121      continue;
6122
6123    // This word of the result is already in the correct place, skip it.
6124    if ((Elt0 == i*2) && (Elt1 == i*2+1))
6125      continue;
6126
6127    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
6128    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
6129    SDValue InsElt;
6130
    // If Elt0 and Elt1 are defined, are consecutive, and can be fetched
    // together using a single extract, extract the word and insert it.
6133    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
6134      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
6135                           DAG.getIntPtrConstant(Elt1 / 2));
6136      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
6137                        DAG.getIntPtrConstant(i));
6138      continue;
6139    }
6140
    // If Elt1 is defined, extract it from the appropriate source.  If the
    // source byte is not also odd, shift the extracted word left 8 bits;
    // otherwise clear the bottom 8 bits if we need to do an OR.
6144    if (Elt1 >= 0) {
6145      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
6146                           DAG.getIntPtrConstant(Elt1 / 2));
6147      if ((Elt1 & 1) == 0)
6148        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
6149                             DAG.getConstant(8,
6150                                  TLI.getShiftAmountTy(InsElt.getValueType())));
6151      else if (Elt0 >= 0)
6152        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
6153                             DAG.getConstant(0xFF00, MVT::i16));
6154    }
6155    // If Elt0 is defined, extract it from the appropriate source.  If the
6156    // source byte is not also even, shift the extracted word right 8 bits. If
6157    // Elt1 was also defined, OR the extracted values together before
6158    // inserting them in the result.
6159    if (Elt0 >= 0) {
6160      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
6161                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
6162      if ((Elt0 & 1) != 0)
6163        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
6164                              DAG.getConstant(8,
6165                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
6166      else if (Elt1 >= 0)
6167        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
6168                             DAG.getConstant(0x00FF, MVT::i16));
6169      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
6170                         : InsElt0;
6171    }
6172    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
6173                       DAG.getIntPtrConstant(i));
6174  }
6175  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
6176}
6177
6178// v32i8 shuffles - Translate to VPSHUFB if possible.
6179static
6180SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
6181                                 const X86Subtarget *Subtarget,
6182                                 SelectionDAG &DAG) {
6183  MVT VT = SVOp->getValueType(0).getSimpleVT();
6184  SDValue V1 = SVOp->getOperand(0);
6185  SDValue V2 = SVOp->getOperand(1);
6186  SDLoc dl(SVOp);
6187  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
6188
6189  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6190  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
6191  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
6192
  // VPSHUFB may be generated if
  // (1) one of the input vectors is undefined or a zeroinitializer.
  //     The mask value 0x80 puts 0 in the corresponding slot of the vector.
  // And (2) the mask indexes don't cross the 128-bit lane boundary.
6197  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
6198      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
6199    return SDValue();
6200
6201  if (V1IsAllZero && !V2IsAllZero) {
6202    CommuteVectorShuffleMask(MaskVals, 32);
6203    V1 = V2;
6204  }
6205  SmallVector<SDValue, 32> pshufbMask;
6206  for (unsigned i = 0; i != 32; i++) {
6207    int EltIdx = MaskVals[i];
6208    if (EltIdx < 0 || EltIdx >= 32)
6209      EltIdx = 0x80;
6210    else {
6211      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
        // Crossing lanes is not allowed.
6213        return SDValue();
6214      EltIdx &= 0xf;
6215    }
6216    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6217  }
6218  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
6219                      DAG.getNode(ISD::BUILD_VECTOR, dl,
6220                                  MVT::v32i8, &pshufbMask[0], 32));
6221}
6222
6223/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
6224/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements points to elements in
/// the right sequence, e.g.
6227/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
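/// is rewritten as the v4i32 shuffle
/// vector_shuffle X', Y', <1, 5, 0, 7>
/// where X' and Y' are the inputs bitcast to v4i32.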
6228static
6229SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
6230                                 SelectionDAG &DAG) {
6231  MVT VT = SVOp->getValueType(0).getSimpleVT();
6232  SDLoc dl(SVOp);
6233  unsigned NumElems = VT.getVectorNumElements();
6234  MVT NewVT;
6235  unsigned Scale;
6236  switch (VT.SimpleTy) {
6237  default: llvm_unreachable("Unexpected!");
6238  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
6239  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
6240  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
6241  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
6242  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
6243  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
6244  }
6245
6246  SmallVector<int, 8> MaskVec;
6247  for (unsigned i = 0; i != NumElems; i += Scale) {
6248    int StartIdx = -1;
6249    for (unsigned j = 0; j != Scale; ++j) {
6250      int EltIdx = SVOp->getMaskElt(i+j);
6251      if (EltIdx < 0)
6252        continue;
6253      if (StartIdx < 0)
6254        StartIdx = (EltIdx / Scale);
6255      if (EltIdx != (int)(StartIdx*Scale + j))
6256        return SDValue();
6257    }
6258    MaskVec.push_back(StartIdx);
6259  }
6260
6261  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
6262  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
6263  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
6264}
6265
6266/// getVZextMovL - Return a zero-extending vector move low node.
6267///
6268static SDValue getVZextMovL(MVT VT, EVT OpVT,
6269                            SDValue SrcOp, SelectionDAG &DAG,
6270                            const X86Subtarget *Subtarget, SDLoc dl) {
6271  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
6272    LoadSDNode *LD = NULL;
6273    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
6274      LD = dyn_cast<LoadSDNode>(SrcOp);
6275    if (!LD) {
6276      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
6277      // instead.
6278      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
6279      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
6280          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6281          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
6282          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
6283        // PR2108
6284        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
6285        return DAG.getNode(ISD::BITCAST, dl, VT,
6286                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6287                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6288                                                   OpVT,
6289                                                   SrcOp.getOperand(0)
6290                                                          .getOperand(0))));
6291      }
6292    }
6293  }
6294
6295  return DAG.getNode(ISD::BITCAST, dl, VT,
6296                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6297                                 DAG.getNode(ISD::BITCAST, dl,
6298                                             OpVT, SrcOp)));
6299}
6300
/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
/// which could not be matched by any known target specific shuffle.
6303static SDValue
6304LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6305
6306  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
6307  if (NewOp.getNode())
6308    return NewOp;
6309
6310  MVT VT = SVOp->getValueType(0).getSimpleVT();
6311
6312  unsigned NumElems = VT.getVectorNumElements();
6313  unsigned NumLaneElems = NumElems / 2;
6314
6315  SDLoc dl(SVOp);
6316  MVT EltVT = VT.getVectorElementType();
6317  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
6318  SDValue Output[2];
6319
6320  SmallVector<int, 16> Mask;
6321  for (unsigned l = 0; l < 2; ++l) {
6322    // Build a shuffle mask for the output, discovering on the fly which
6323    // input vectors to use as shuffle operands (recorded in InputUsed).
6324    // If building a suitable shuffle vector proves too hard, then bail
6325    // out with UseBuildVector set.
6326    bool UseBuildVector = false;
6327    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
6328    unsigned LaneStart = l * NumLaneElems;
6329    for (unsigned i = 0; i != NumLaneElems; ++i) {
6330      // The mask element.  This indexes into the input.
6331      int Idx = SVOp->getMaskElt(i+LaneStart);
6332      if (Idx < 0) {
6333        // the mask element does not index into any input vector.
6334        Mask.push_back(-1);
6335        continue;
6336      }
6337
6338      // The input vector this mask element indexes into.
6339      int Input = Idx / NumLaneElems;
6340
6341      // Turn the index into an offset from the start of the input vector.
6342      Idx -= Input * NumLaneElems;
6343
6344      // Find or create a shuffle vector operand to hold this input.
6345      unsigned OpNo;
6346      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
6347        if (InputUsed[OpNo] == Input)
6348          // This input vector is already an operand.
6349          break;
6350        if (InputUsed[OpNo] < 0) {
6351          // Create a new operand for this input vector.
6352          InputUsed[OpNo] = Input;
6353          break;
6354        }
6355      }
6356
6357      if (OpNo >= array_lengthof(InputUsed)) {
6358        // More than two input vectors used!  Give up on trying to create a
6359        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
6360        UseBuildVector = true;
6361        break;
6362      }
6363
6364      // Add the mask index for the new shuffle vector.
6365      Mask.push_back(Idx + OpNo * NumLaneElems);
6366    }
6367
6368    if (UseBuildVector) {
6369      SmallVector<SDValue, 16> SVOps;
6370      for (unsigned i = 0; i != NumLaneElems; ++i) {
6371        // The mask element.  This indexes into the input.
6372        int Idx = SVOp->getMaskElt(i+LaneStart);
6373        if (Idx < 0) {
6374          SVOps.push_back(DAG.getUNDEF(EltVT));
6375          continue;
6376        }
6377
6378        // The input vector this mask element indexes into.
6379        int Input = Idx / NumElems;
6380
6381        // Turn the index into an offset from the start of the input vector.
6382        Idx -= Input * NumElems;
6383
6384        // Extract the vector element by hand.
6385        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
6386                                    SVOp->getOperand(Input),
6387                                    DAG.getIntPtrConstant(Idx)));
6388      }
6389
6390      // Construct the output using a BUILD_VECTOR.
6391      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
6392                              SVOps.size());
6393    } else if (InputUsed[0] < 0) {
6394      // No input vectors were used! The result is undefined.
6395      Output[l] = DAG.getUNDEF(NVT);
6396    } else {
6397      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
6398                                        (InputUsed[0] % 2) * NumLaneElems,
6399                                        DAG, dl);
6400      // If only one input was used, use an undefined vector for the other.
6401      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
6402        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
6403                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
6404      // At least one input vector was used. Create a new shuffle vector.
6405      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
6406    }
6407
6408    Mask.clear();
6409  }
6410
6411  // Concatenate the result back
6412  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
6413}
6414
6415/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
6416/// 4 elements, and match them with several different shuffle types.
6417static SDValue
6418LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6419  SDValue V1 = SVOp->getOperand(0);
6420  SDValue V2 = SVOp->getOperand(1);
6421  SDLoc dl(SVOp);
6422  MVT VT = SVOp->getValueType(0).getSimpleVT();
6423
6424  assert(VT.is128BitVector() && "Unsupported vector size");
6425
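  // For each result element, Locs[i] records where that element can be found
  // after the intermediate shuffles (or (-1, -1) if it is undef), so the
  // final shuffle mask can be rebuilt from it.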
6426  std::pair<int, int> Locs[4];
6427  int Mask1[] = { -1, -1, -1, -1 };
6428  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
6429
6430  unsigned NumHi = 0;
6431  unsigned NumLo = 0;
6432  for (unsigned i = 0; i != 4; ++i) {
6433    int Idx = PermMask[i];
6434    if (Idx < 0) {
6435      Locs[i] = std::make_pair(-1, -1);
6436    } else {
6437      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
6438      if (Idx < 4) {
6439        Locs[i] = std::make_pair(0, NumLo);
6440        Mask1[NumLo] = Idx;
6441        NumLo++;
6442      } else {
6443        Locs[i] = std::make_pair(1, NumHi);
6444        if (2+NumHi < 4)
6445          Mask1[2+NumHi] = Idx;
6446        NumHi++;
6447      }
6448    }
6449  }
6450
6451  if (NumLo <= 2 && NumHi <= 2) {
    // No more than two elements come from either vector. This can be
    // implemented with two shuffles. The first shuffle gathers the elements.
    // The second shuffle, which takes the first shuffle as both of its
    // vector operands, puts the elements into the right order.
6456    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6457
6458    int Mask2[] = { -1, -1, -1, -1 };
6459
6460    for (unsigned i = 0; i != 4; ++i)
6461      if (Locs[i].first != -1) {
6462        unsigned Idx = (i < 2) ? 0 : 4;
6463        Idx += Locs[i].first * 2 + Locs[i].second;
6464        Mask2[i] = Idx;
6465      }
6466
6467    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
6468  }
6469
6470  if (NumLo == 3 || NumHi == 3) {
6471    // Otherwise, we must have three elements from one vector, call it X, and
6472    // one element from the other, call it Y.  First, use a shufps to build an
6473    // intermediate vector with the one element from Y and the element from X
6474    // that will be in the same half in the final destination (the indexes don't
6475    // matter). Then, use a shufps to build the final vector, taking the half
6476    // containing the element from Y from the intermediate, and the other half
6477    // from X.
6478    if (NumHi == 3) {
6479      // Normalize it so the 3 elements come from V1.
6480      CommuteVectorShuffleMask(PermMask, 4);
6481      std::swap(V1, V2);
6482    }
6483
6484    // Find the element from V2.
6485    unsigned HiIndex;
6486    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
6487      int Val = PermMask[HiIndex];
6488      if (Val < 0)
6489        continue;
6490      if (Val >= 4)
6491        break;
6492    }
6493
6494    Mask1[0] = PermMask[HiIndex];
6495    Mask1[1] = -1;
6496    Mask1[2] = PermMask[HiIndex^1];
6497    Mask1[3] = -1;
6498    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6499
6500    if (HiIndex >= 2) {
6501      Mask1[0] = PermMask[0];
6502      Mask1[1] = PermMask[1];
6503      Mask1[2] = HiIndex & 1 ? 6 : 4;
6504      Mask1[3] = HiIndex & 1 ? 4 : 6;
6505      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6506    }
6507
6508    Mask1[0] = HiIndex & 1 ? 2 : 0;
6509    Mask1[1] = HiIndex & 1 ? 0 : 2;
6510    Mask1[2] = PermMask[2];
6511    Mask1[3] = PermMask[3];
6512    if (Mask1[2] >= 0)
6513      Mask1[2] += 4;
6514    if (Mask1[3] >= 0)
6515      Mask1[3] += 4;
6516    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
6517  }
6518
6519  // Break it into (shuffle shuffle_hi, shuffle_lo).
6520  int LoMask[] = { -1, -1, -1, -1 };
6521  int HiMask[] = { -1, -1, -1, -1 };
6522
6523  int *MaskPtr = LoMask;
6524  unsigned MaskIdx = 0;
6525  unsigned LoIdx = 0;
6526  unsigned HiIdx = 2;
6527  for (unsigned i = 0; i != 4; ++i) {
6528    if (i == 2) {
6529      MaskPtr = HiMask;
6530      MaskIdx = 1;
6531      LoIdx = 0;
6532      HiIdx = 2;
6533    }
6534    int Idx = PermMask[i];
6535    if (Idx < 0) {
6536      Locs[i] = std::make_pair(-1, -1);
6537    } else if (Idx < 4) {
6538      Locs[i] = std::make_pair(MaskIdx, LoIdx);
6539      MaskPtr[LoIdx] = Idx;
6540      LoIdx++;
6541    } else {
6542      Locs[i] = std::make_pair(MaskIdx, HiIdx);
6543      MaskPtr[HiIdx] = Idx;
6544      HiIdx++;
6545    }
6546  }
6547
6548  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
6549  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
6550  int MaskOps[] = { -1, -1, -1, -1 };
6551  for (unsigned i = 0; i != 4; ++i)
6552    if (Locs[i].first != -1)
6553      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
6554  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
6555}
6556
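// MayFoldVectorLoad - Return true if V is a load, possibly wrapped in
// single-use BITCAST, SCALAR_TO_VECTOR or (BUILD_VECTOR load, undef) nodes,
// that may be folded into a memory operand.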
6557static bool MayFoldVectorLoad(SDValue V) {
6558  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6559    V = V.getOperand(0);
6560
6561  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6562    V = V.getOperand(0);
6563  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
6564      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
6565    // BUILD_VECTOR (load), undef
6566    V = V.getOperand(0);
6567
6568  return MayFoldLoad(V);
6569}
6570
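// getMOVDDup - Emit a MOVDDUP of V1 (which duplicates its low 64-bit
// element), bitcasting the input to v2f64 and the result back to the
// original type.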
6571static
6572SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
6573  EVT VT = Op.getValueType();
6574
  // Canonicalize to v2f64.
6576  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
6577  return DAG.getNode(ISD::BITCAST, dl, VT,
6578                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
6579                                          V1, DAG));
6580}
6581
6582static
6583SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
6584                        bool HasSSE2) {
6585  SDValue V1 = Op.getOperand(0);
6586  SDValue V2 = Op.getOperand(1);
6587  EVT VT = Op.getValueType();
6588
6589  assert(VT != MVT::v2i64 && "unsupported shuffle type");
6590
6591  if (HasSSE2 && VT == MVT::v2f64)
6592    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
6593
  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
6595  return DAG.getNode(ISD::BITCAST, dl, VT,
6596                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
6597                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
6598                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
6599}
6600
6601static
6602SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
6603  SDValue V1 = Op.getOperand(0);
6604  SDValue V2 = Op.getOperand(1);
6605  EVT VT = Op.getValueType();
6606
6607  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
6608         "unsupported shuffle type");
6609
6610  if (V2.getOpcode() == ISD::UNDEF)
6611    V2 = V1;
6612
6613  // v4i32 or v4f32
6614  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
6615}
6616
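// getMOVLP - Lower a MOVLP-compatible shuffle: prefer MOVLPS/MOVLPD when one
// of the operands is a load that can be folded, otherwise fall back to
// MOVSD/MOVSS on SSE2 or to SHUFPS with the operands swapped.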
6617static
6618SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
6619  SDValue V1 = Op.getOperand(0);
6620  SDValue V2 = Op.getOperand(1);
6621  EVT VT = Op.getValueType();
6622  unsigned NumElems = VT.getVectorNumElements();
6623
  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
  // operand of these instructions can only be memory, so check if there's a
  // potential load folding here; otherwise use SHUFPS or MOVSD to match the
  // same masks.
6628  bool CanFoldLoad = false;
6629
6630  // Trivial case, when V2 comes from a load.
6631  if (MayFoldVectorLoad(V2))
6632    CanFoldLoad = true;
6633
6634  // When V1 is a load, it can be folded later into a store in isel, example:
6635  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
6636  //    turns into:
6637  //  (MOVLPSmr addr:$src1, VR128:$src2)
6638  // So, recognize this potential and also use MOVLPS or MOVLPD
6639  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
6640    CanFoldLoad = true;
6641
6642  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6643  if (CanFoldLoad) {
6644    if (HasSSE2 && NumElems == 2)
6645      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
6646
6647    if (NumElems == 4)
      // If we don't care about the second element, proceed to use movss
      // (below); otherwise match MOVLPS here.
6649      if (SVOp->getMaskElt(1) != -1)
6650        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
6651  }
6652
  // movl and movlp will both match v2i64, but v2i64 is never matched by
  // movl earlier because we make it strict to avoid messing with the movlp
  // load folding logic (see the code above the getMOVLP call). Match it here
  // instead; this is horrible, but will stay like this until we move all
  // shuffle matching to x86 specific nodes. Note that for the 1st condition
  // all types are matched with movsd.
6659  if (HasSSE2) {
6660    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
6661    // so that this logic can be removed from here as much as possible.
6662    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
6663      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6664    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6665  }
6666
6667  assert(VT != MVT::v4i32 && "unsupported shuffle type");
6668
6669  // Invert the operand order and use SHUFPS to match it.
6670  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
6671                              getShuffleSHUFImmediate(SVOp), DAG);
6672}
6673
6674// Reduce a vector shuffle to zext.
6675SDValue
6676X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
6677  // PMOVZX is only available from SSE41.
6678  if (!Subtarget->hasSSE41())
6679    return SDValue();
6680
6681  EVT VT = Op.getValueType();
6682
6683  // Only AVX2 supports 256-bit vector integer extension.
6684  if (!Subtarget->hasInt256() && VT.is256BitVector())
6685    return SDValue();
6686
6687  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6688  SDLoc DL(Op);
6689  SDValue V1 = Op.getOperand(0);
6690  SDValue V2 = Op.getOperand(1);
6691  unsigned NumElems = VT.getVectorNumElements();
6692
6693  // Extension is a unary operation, and the element type of the source vector
6694  // must be smaller than i64.
6695  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
6696      VT.getVectorElementType() == MVT::i64)
6697    return SDValue();
6698
6699  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
6700  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
6701  while ((1U << Shift) < NumElems) {
6702    if (SVOp->getMaskElt(1U << Shift) == 1)
6703      break;
6704    Shift += 1;
6705    // The maximal ratio is 8, i.e. from i8 to i64.
6706    if (Shift > 3)
6707      return SDValue();
6708  }
6709
6710  // Check the shuffle mask.
6711  unsigned Mask = (1U << Shift) - 1;
6712  for (unsigned i = 0; i != NumElems; ++i) {
6713    int EltIdx = SVOp->getMaskElt(i);
6714    if ((i & Mask) != 0 && EltIdx != -1)
6715      return SDValue();
6716    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
6717      return SDValue();
6718  }
6719
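  // At this point the mask is known to have the zero-extension shape: result
  // lane (i << Shift) reads source element i and all other lanes are undef.
  // For example, the v8i16 mask <0,-1,1,-1,2,-1,3,-1> with an undef V2 gives
  // Shift == 1 (ratio 2) and is lowered below to a zero extension of the low
  // four i16 elements to i32 (e.g. PMOVZXWD on SSE4.1).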
6720  LLVMContext *Context = DAG.getContext();
6721  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
6722  EVT NeVT = EVT::getIntegerVT(*Context, NBits);
6723  EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);
6724
6725  if (!isTypeLegal(NVT))
6726    return SDValue();
6727
6728  // Simplify the operand before it is fed into the shuffle.
6729  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
6730  if (V1.getOpcode() == ISD::BITCAST &&
6731      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6732      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6733      V1.getOperand(0)
6734        .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
6735    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
6736    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
6737    ConstantSDNode *CIdx =
6738      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
6739    // If it's foldable, i.e. a normal load with a single use, let instruction
6740    // selection fold it. Otherwise, shorten the conversion sequence.
6741    if (CIdx && CIdx->getZExtValue() == 0 &&
6742        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
6743      if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
6744        // The "ext_vec_elt" node is wider than the result node.
6745        // In this case we should extract subvector from V.
6746        // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
6747        unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
6748        EVT FullVT = V.getValueType();
6749        EVT SubVecVT = EVT::getVectorVT(*Context,
6750                                        FullVT.getVectorElementType(),
6751                                        FullVT.getVectorNumElements()/Ratio);
6752        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
6753                        DAG.getIntPtrConstant(0));
6754      }
6755      V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
6756    }
6757  }
6758
6759  return DAG.getNode(ISD::BITCAST, DL, VT,
6760                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
6761}
6762
6763SDValue
6764X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
6765  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6766  MVT VT = Op.getValueType().getSimpleVT();
6767  SDLoc dl(Op);
6768  SDValue V1 = Op.getOperand(0);
6769  SDValue V2 = Op.getOperand(1);
6770
6771  if (isZeroShuffle(SVOp))
6772    return getZeroVector(VT, Subtarget, DAG, dl);
6773
6774  // Handle splat operations
6775  if (SVOp->isSplat()) {
6776    // Use vbroadcast whenever the splat comes from a foldable load
6777    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
6778    if (Broadcast.getNode())
6779      return Broadcast;
6780  }
6781
6782  // Check integer expanding shuffles.
6783  SDValue NewOp = LowerVectorIntExtend(Op, DAG);
6784  if (NewOp.getNode())
6785    return NewOp;
6786
6787  // If the shuffle can be profitably rewritten as a narrower shuffle, then
6788  // do it!
6789  if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
6790      VT == MVT::v16i16 || VT == MVT::v32i8) {
6791    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
6792    if (NewOp.getNode())
6793      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
6794  } else if ((VT == MVT::v4i32 ||
6795             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
6796    // FIXME: Figure out a cleaner way to do this.
6797    // Try to make use of movq to zero out the top part.
6798    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
6799      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
6800      if (NewOp.getNode()) {
6801        MVT NewVT = NewOp.getValueType().getSimpleVT();
6802        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
6803                               NewVT, true, false))
6804          return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
6805                              DAG, Subtarget, dl);
6806      }
6807    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
6808      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
6809      if (NewOp.getNode()) {
6810        MVT NewVT = NewOp.getValueType().getSimpleVT();
6811        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
6812          return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
6813                              DAG, Subtarget, dl);
6814      }
6815    }
6816  }
6817  return SDValue();
6818}
6819
6820SDValue
6821X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
6822  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6823  SDValue V1 = Op.getOperand(0);
6824  SDValue V2 = Op.getOperand(1);
6825  MVT VT = Op.getValueType().getSimpleVT();
6826  SDLoc dl(Op);
6827  unsigned NumElems = VT.getVectorNumElements();
6828  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
6829  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6830  bool V1IsSplat = false;
6831  bool V2IsSplat = false;
6832  bool HasSSE2 = Subtarget->hasSSE2();
6833  bool HasFp256    = Subtarget->hasFp256();
6834  bool HasInt256   = Subtarget->hasInt256();
6835  MachineFunction &MF = DAG.getMachineFunction();
6836  bool OptForSize = MF.getFunction()->getAttributes().
6837    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
6838
6839  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
6840
6841  if (V1IsUndef && V2IsUndef)
6842    return DAG.getUNDEF(VT);
6843
6844  assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
6845
6846  // Vector shuffle lowering takes 3 steps:
6847  //
6848  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
6849  //    narrowing and commutation of operands should be handled.
6850  // 2) Matching of shuffles with known shuffle masks to x86 target specific
6851  //    shuffle nodes.
6852  // 3) Rewriting of unmatched masks into new generic shuffle operations,
6853  //    so the shuffle can be broken into other shuffles and the legalizer can
6854  //    try the lowering again.
6855  //
6856  // The general idea is that no vector_shuffle operation should be left to
6857  // be matched during isel, all of them must be converted to a target specific
6858  // node here.
6859
6860  // Normalize the input vectors. Here splats, zeroed vectors, profitable
6861  // narrowing and commutation of operands should be handled. The actual code
6862  // doesn't include all of those, work in progress...
6863  SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
6864  if (NewOp.getNode())
6865    return NewOp;
6866
6867  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
6868
6869  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
6870  // unpckh_undef). Only use pshufd if speed is more important than size.
6871  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
6872    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6873  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
6874    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6875
6876  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
6877      V2IsUndef && MayFoldVectorLoad(V1))
6878    return getMOVDDup(Op, dl, V1, DAG);
6879
6880  if (isMOVHLPS_v_undef_Mask(M, VT))
6881    return getMOVHighToLow(Op, dl, DAG);
6882
6883  // Used to match splats.
6884  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
6885      (VT == MVT::v2f64 || VT == MVT::v2i64))
6886    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6887
6888  if (isPSHUFDMask(M, VT)) {
6889    // The mask matched by the if above can be selected into several different
6890    // instructions during isel, not only pshufd as the name suggests; sad but
6891    // true. Emulate that behavior for now...
6892    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
6893      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
6894
6895    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
6896
6897    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
6898      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
6899
6900    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
6901      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
6902                                  DAG);
6903
6904    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
6905                                TargetMask, DAG);
6906  }
6907
6908  if (isPALIGNRMask(M, VT, Subtarget))
6909    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
6910                                getShufflePALIGNRImmediate(SVOp),
6911                                DAG);
6912
6913  // Check if this can be converted into a logical shift.
6914  bool isLeft = false;
6915  unsigned ShAmt = 0;
6916  SDValue ShVal;
6917  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
6918  if (isShift && ShVal.hasOneUse()) {
6919    // If the shifted value has multiple uses, it may be cheaper to use
6920    // v_set0 + movlhps or movhlps, etc.
6921    MVT EltVT = VT.getVectorElementType();
6922    ShAmt *= EltVT.getSizeInBits();
6923    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6924  }
6925
6926  if (isMOVLMask(M, VT)) {
6927    if (ISD::isBuildVectorAllZeros(V1.getNode()))
6928      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
6929    if (!isMOVLPMask(M, VT)) {
6930      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
6931        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6932
6933      if (VT == MVT::v4i32 || VT == MVT::v4f32)
6934        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6935    }
6936  }
6937
6938  // FIXME: fold these into legal mask.
6939  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
6940    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
6941
6942  if (isMOVHLPSMask(M, VT))
6943    return getMOVHighToLow(Op, dl, DAG);
6944
6945  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
6946    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
6947
6948  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
6949    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
6950
6951  if (isMOVLPMask(M, VT))
6952    return getMOVLP(Op, dl, DAG, HasSSE2);
6953
6954  if (ShouldXformToMOVHLPS(M, VT) ||
6955      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
6956    return CommuteVectorShuffle(SVOp, DAG);
6957
6958  if (isShift) {
6959    // No better options. Use a vshldq / vsrldq.
6960    MVT EltVT = VT.getVectorElementType();
6961    ShAmt *= EltVT.getSizeInBits();
6962    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6963  }
6964
6965  bool Commuted = false;
6966  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
6967  // 1,1,1,1 -> v8i16 though.
6968  V1IsSplat = isSplatVector(V1.getNode());
6969  V2IsSplat = isSplatVector(V2.getNode());
6970
6971  // Canonicalize the splat or undef, if present, to be on the RHS.
6972  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
6973    CommuteVectorShuffleMask(M, NumElems);
6974    std::swap(V1, V2);
6975    std::swap(V1IsSplat, V2IsSplat);
6976    Commuted = true;
6977  }
6978
6979  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
6980    // Shuffling low element of v1 into undef, just return v1.
6981    if (V2IsUndef)
6982      return V1;
6983    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
6984    // the instruction selector will not match, so get a canonical MOVL with
6985    // swapped operands to undo the commute.
6986    return getMOVL(DAG, dl, VT, V2, V1);
6987  }
6988
6989  if (isUNPCKLMask(M, VT, HasInt256))
6990    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6991
6992  if (isUNPCKHMask(M, VT, HasInt256))
6993    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6994
6995  if (V2IsSplat) {
6996    // Normalize the mask so all entries that point to V2 point to its first
6997    // element, then try to match unpck{h|l} again. If it matches, return a
6998    // new vector_shuffle with the corrected mask.
6999    SmallVector<int, 8> NewMask(M.begin(), M.end());
7000    NormalizeMask(NewMask, NumElems);
7001    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
7002      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
7003    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
7004      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
7005  }
7006
7007  if (Commuted) {
7008    // Commute it back and try unpck* again.
7009    // FIXME: this seems wrong.
7010    CommuteVectorShuffleMask(M, NumElems);
7011    std::swap(V1, V2);
7012    std::swap(V1IsSplat, V2IsSplat);
7013    Commuted = false;
7014
7015    if (isUNPCKLMask(M, VT, HasInt256))
7016      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
7017
7018    if (isUNPCKHMask(M, VT, HasInt256))
7019      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
7020  }
7021
7022  // Normalize the node to match x86 shuffle ops if needed
7023  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
7024    return CommuteVectorShuffle(SVOp, DAG);
7025
7026  // The checks below are all present in isShuffleMaskLegal, but they are
7027  // inlined here right now to enable us to directly emit target specific
7028  // nodes, and remove one by one until they don't return Op anymore.
7029
7030  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
7031      SVOp->getSplatIndex() == 0 && V2IsUndef) {
7032    if (VT == MVT::v2f64 || VT == MVT::v2i64)
7033      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
7034  }
7035
7036  if (isPSHUFHWMask(M, VT, HasInt256))
7037    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
7038                                getShufflePSHUFHWImmediate(SVOp),
7039                                DAG);
7040
7041  if (isPSHUFLWMask(M, VT, HasInt256))
7042    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
7043                                getShufflePSHUFLWImmediate(SVOp),
7044                                DAG);
7045
7046  if (isSHUFPMask(M, VT, HasFp256))
7047    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
7048                                getShuffleSHUFImmediate(SVOp), DAG);
7049
7050  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
7051    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
7052  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
7053    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
7054
7055  //===--------------------------------------------------------------------===//
7056  // Generate target-specific nodes for 128- or 256-bit shuffles that are only
7057  // supported by the AVX instruction set.
7058  //
7059
7060  // Handle VMOVDDUPY permutations
7061  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
7062    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
7063
7064  // Handle VPERMILPS/D* permutations
7065  if (isVPERMILPMask(M, VT, HasFp256)) {
7066    if (HasInt256 && VT == MVT::v8i32)
7067      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
7068                                  getShuffleSHUFImmediate(SVOp), DAG);
7069    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
7070                                getShuffleSHUFImmediate(SVOp), DAG);
7071  }
7072
7073  // Handle VPERM2F128/VPERM2I128 permutations
7074  if (isVPERM2X128Mask(M, VT, HasFp256))
7075    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
7076                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
7077
7078  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
7079  if (BlendOp.getNode())
7080    return BlendOp;
7081
7082  if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
7083    SmallVector<SDValue, 8> permclMask;
7084    for (unsigned i = 0; i != 8; ++i) {
7085      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
7086    }
7087    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
7088                               &permclMask[0], 8);
7089    // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
7090    return DAG.getNode(X86ISD::VPERMV, dl, VT,
7091                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
7092  }
7093
7094  if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
7095    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
7096                                getShuffleCLImmediate(SVOp), DAG);
7097
7098  //===--------------------------------------------------------------------===//
7099  // Since no target specific shuffle was selected for this generic one,
7100  // lower it into other known shuffles. FIXME: this isn't true yet, but
7101  // this is the plan.
7102  //
7103
7104  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
7105  if (VT == MVT::v8i16) {
7106    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
7107    if (NewOp.getNode())
7108      return NewOp;
7109  }
7110
7111  if (VT == MVT::v16i8) {
7112    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
7113    if (NewOp.getNode())
7114      return NewOp;
7115  }
7116
7117  if (VT == MVT::v32i8) {
7118    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
7119    if (NewOp.getNode())
7120      return NewOp;
7121  }
7122
7123  // Handle all 128-bit wide vectors with 4 elements, and match them with
7124  // several different shuffle types.
7125  if (NumElems == 4 && VT.is128BitVector())
7126    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
7127
7128  // Handle general 256-bit shuffles
7129  if (VT.is256BitVector())
7130    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
7131
7132  return SDValue();
7133}
7134
7135static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
7136  MVT VT = Op.getValueType().getSimpleVT();
7137  SDLoc dl(Op);
7138
7139  if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
7140    return SDValue();
7141
7142  if (VT.getSizeInBits() == 8) {
7143    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
7144                                  Op.getOperand(0), Op.getOperand(1));
7145    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
7146                                  DAG.getValueType(VT));
7147    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7148  }
7149
7150  if (VT.getSizeInBits() == 16) {
7151    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7152    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
7153    if (Idx == 0)
7154      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
7155                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7156                                     DAG.getNode(ISD::BITCAST, dl,
7157                                                 MVT::v4i32,
7158                                                 Op.getOperand(0)),
7159                                     Op.getOperand(1)));
7160    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
7161                                  Op.getOperand(0), Op.getOperand(1));
7162    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
7163                                  DAG.getValueType(VT));
7164    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7165  }
7166
7167  if (VT == MVT::f32) {
7168    // EXTRACTPS outputs to a GPR32 register, which will require a movd to copy
7169    // the result back to an FR32 register. It's only worth matching if the
7170    // result has a single use which is a store or a bitcast to i32.  And in
7171    // the case of a store, it's not worth it if the index is a constant 0,
7172    // because a MOVSSmr can be used instead, which is smaller and faster.
7173    if (!Op.hasOneUse())
7174      return SDValue();
7175    SDNode *User = *Op.getNode()->use_begin();
7176    if ((User->getOpcode() != ISD::STORE ||
7177         (isa<ConstantSDNode>(Op.getOperand(1)) &&
7178          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
7179        (User->getOpcode() != ISD::BITCAST ||
7180         User->getValueType(0) != MVT::i32))
7181      return SDValue();
7182    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7183                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
7184                                              Op.getOperand(0)),
7185                                              Op.getOperand(1));
7186    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
7187  }
7188
7189  if (VT == MVT::i32 || VT == MVT::i64) {
7190    // ExtractPS/pextrq work with a constant index.
7191    if (isa<ConstantSDNode>(Op.getOperand(1)))
7192      return Op;
7193  }
7194  return SDValue();
7195}
7196
7197SDValue
7198X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
7199                                           SelectionDAG &DAG) const {
7200  if (!isa<ConstantSDNode>(Op.getOperand(1)))
7201    return SDValue();
7202
7203  SDValue Vec = Op.getOperand(0);
7204  MVT VecVT = Vec.getValueType().getSimpleVT();
7205
7206  // If this is a 256-bit vector result, first extract the 128-bit vector and
7207  // then extract the element from the 128-bit vector.
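  // For example, extracting element 5 from a v8i32 becomes an extract of
  // element 1 from the upper v4i32 half.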
7208  if (VecVT.is256BitVector()) {
7209    SDLoc dl(Op.getNode());
7210    unsigned NumElems = VecVT.getVectorNumElements();
7211    SDValue Idx = Op.getOperand(1);
7212    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7213
7214    // Get the 128-bit vector.
7215    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
7216
7217    if (IdxVal >= NumElems/2)
7218      IdxVal -= NumElems/2;
7219    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
7220                       DAG.getConstant(IdxVal, MVT::i32));
7221  }
7222
7223  assert(VecVT.is128BitVector() && "Unexpected vector length");
7224
7225  if (Subtarget->hasSSE41()) {
7226    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
7227    if (Res.getNode())
7228      return Res;
7229  }
7230
7231  MVT VT = Op.getValueType().getSimpleVT();
7232  SDLoc dl(Op);
7233  // TODO: handle v16i8.
7234  if (VT.getSizeInBits() == 16) {
7235    SDValue Vec = Op.getOperand(0);
7236    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7237    if (Idx == 0)
7238      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
7239                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7240                                     DAG.getNode(ISD::BITCAST, dl,
7241                                                 MVT::v4i32, Vec),
7242                                     Op.getOperand(1)));
7243    // Transform it so it matches pextrw, which produces a 32-bit result.
7244    MVT EltVT = MVT::i32;
7245    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
7246                                  Op.getOperand(0), Op.getOperand(1));
7247    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
7248                                  DAG.getValueType(VT));
7249    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7250  }
7251
7252  if (VT.getSizeInBits() == 32) {
7253    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7254    if (Idx == 0)
7255      return Op;
7256
7257    // SHUFPS the element to the lowest double word, then movss.
7258    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
7259    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
7260    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7261                                       DAG.getUNDEF(VVT), Mask);
7262    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7263                       DAG.getIntPtrConstant(0));
7264  }
7265
7266  if (VT.getSizeInBits() == 64) {
7267    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
7268    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
7269    //        to match extract_elt for f64.
7270    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7271    if (Idx == 0)
7272      return Op;
7273
7274    // UNPCKHPD the element to the lowest double word, then movsd.
7275    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
7276    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
7277    int Mask[2] = { 1, -1 };
7278    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
7279    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7280                                       DAG.getUNDEF(VVT), Mask);
7281    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7282                       DAG.getIntPtrConstant(0));
7283  }
7284
7285  return SDValue();
7286}
7287
7288static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
7289  MVT VT = Op.getValueType().getSimpleVT();
7290  MVT EltVT = VT.getVectorElementType();
7291  SDLoc dl(Op);
7292
7293  SDValue N0 = Op.getOperand(0);
7294  SDValue N1 = Op.getOperand(1);
7295  SDValue N2 = Op.getOperand(2);
7296
7297  if (!VT.is128BitVector())
7298    return SDValue();
7299
7300  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
7301      isa<ConstantSDNode>(N2)) {
7302    unsigned Opc;
7303    if (VT == MVT::v8i16)
7304      Opc = X86ISD::PINSRW;
7307    else
7308      Opc = X86ISD::PINSRB;
7309
7310    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
7311    // second argument.
7312    if (N1.getValueType() != MVT::i32)
7313      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7314    if (N2.getValueType() != MVT::i32)
7315      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7316    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
7317  }
7318
7319  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
7320    // Bits [7:6] of the constant are the source select.  This will always be
7321    //  zero here.  The DAG Combiner may combine an extract_elt index into these
7322    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
7323    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
7324    // Bits [5:4] of the constant are the destination select.  This is the
7325    //  value of the incoming immediate.
7326    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
7327    //   combine either bitwise AND or insert of float 0.0 to set these bits.
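    // For example, inserting into destination lane 2, with source select and
    // zero mask both 0, produces the immediate 0x20 (bits [5:4] = 2).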
7328    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
7329    // Create this as a scalar to vector.
7330    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
7331    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
7332  }
7333
7334  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
7335    // PINSR* works with a constant index.
7336    return Op;
7337  }
7338  return SDValue();
7339}
7340
7341SDValue
7342X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
7343  MVT VT = Op.getValueType().getSimpleVT();
7344  MVT EltVT = VT.getVectorElementType();
7345
7346  SDLoc dl(Op);
7347  SDValue N0 = Op.getOperand(0);
7348  SDValue N1 = Op.getOperand(1);
7349  SDValue N2 = Op.getOperand(2);
7350
7351  // If this is a 256-bit vector result, first extract the 128-bit vector,
7352  // insert the element into the extracted half and then place it back.
7353  if (VT.is256BitVector()) {
7354    if (!isa<ConstantSDNode>(N2))
7355      return SDValue();
7356
7357    // Get the desired 128-bit vector half.
7358    unsigned NumElems = VT.getVectorNumElements();
7359    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
7360    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
7361
7362    // Insert the element into the desired half.
7363    bool Upper = IdxVal >= NumElems/2;
7364    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
7365                 DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
7366
7367    // Insert the changed part back to the 256-bit vector
7368    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
7369  }
7370
7371  if (Subtarget->hasSSE41())
7372    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
7373
7374  if (EltVT == MVT::i8)
7375    return SDValue();
7376
7377  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
7378    // Transform it so it matches pinsrw, which expects a 16-bit value in a
7379    // GR32 as its second argument.
7380    if (N1.getValueType() != MVT::i32)
7381      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7382    if (N2.getValueType() != MVT::i32)
7383      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7384    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
7385  }
7386  return SDValue();
7387}
7388
7389static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
7390  LLVMContext *Context = DAG.getContext();
7391  SDLoc dl(Op);
7392  MVT OpVT = Op.getValueType().getSimpleVT();
7393
7394  // If this is a 256-bit vector result, first insert into a 128-bit
7395  // vector and then insert into the 256-bit vector.
7396  if (!OpVT.is128BitVector()) {
7397    // Insert into a 128-bit vector.
7398    EVT VT128 = EVT::getVectorVT(*Context,
7399                                 OpVT.getVectorElementType(),
7400                                 OpVT.getVectorNumElements() / 2);
7401
7402    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
7403
7404    // Insert the 128-bit vector.
7405    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
7406  }
7407
7408  if (OpVT == MVT::v1i64 &&
7409      Op.getOperand(0).getValueType() == MVT::i64)
7410    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
7411
7412  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
7413  assert(OpVT.is128BitVector() && "Expected an SSE type!");
7414  return DAG.getNode(ISD::BITCAST, dl, OpVT,
7415                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
7416}
7417
7418// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
7419// a simple subregister reference or explicit instructions to grab the
7420// upper bits of a vector.
7421static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7422                                      SelectionDAG &DAG) {
7423  if (Subtarget->hasFp256()) {
7424    SDLoc dl(Op.getNode());
7425    SDValue Vec = Op.getNode()->getOperand(0);
7426    SDValue Idx = Op.getNode()->getOperand(1);
7427
7428    if (Op.getNode()->getValueType(0).is128BitVector() &&
7429        Vec.getNode()->getValueType(0).is256BitVector() &&
7430        isa<ConstantSDNode>(Idx)) {
7431      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7432      return Extract128BitVector(Vec, IdxVal, DAG, dl);
7433    }
7434  }
7435  return SDValue();
7436}
7437
7438// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
7439// simple superregister reference or explicit instructions to insert
7440// the upper bits of a vector.
7441static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7442                                     SelectionDAG &DAG) {
7443  if (Subtarget->hasFp256()) {
7444    SDLoc dl(Op.getNode());
7445    SDValue Vec = Op.getNode()->getOperand(0);
7446    SDValue SubVec = Op.getNode()->getOperand(1);
7447    SDValue Idx = Op.getNode()->getOperand(2);
7448
7449    if (Op.getNode()->getValueType(0).is256BitVector() &&
7450        SubVec.getNode()->getValueType(0).is128BitVector() &&
7451        isa<ConstantSDNode>(Idx)) {
7452      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7453      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
7454    }
7455  }
7456  return SDValue();
7457}
7458
7459// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
7460// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
7461// one of the above-mentioned nodes. It has to be wrapped because otherwise
7462// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
7463// be used to form an addressing mode. These wrapped nodes will be selected
7464// into MOV32ri.
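// For example, in 32-bit GOT-style PIC the address of a constant-pool entry is
// built below as roughly
//   (add (X86ISD::GlobalBaseReg), (X86ISD::Wrapper tconstpool:$cp@GOTOFF)),
// i.e. the PIC base register plus a @GOTOFF-relocated constant-pool reference.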
7465SDValue
7466X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
7467  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
7468
7469  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7470  // global base reg.
7471  unsigned char OpFlag = 0;
7472  unsigned WrapperKind = X86ISD::Wrapper;
7473  CodeModel::Model M = getTargetMachine().getCodeModel();
7474
7475  if (Subtarget->isPICStyleRIPRel() &&
7476      (M == CodeModel::Small || M == CodeModel::Kernel))
7477    WrapperKind = X86ISD::WrapperRIP;
7478  else if (Subtarget->isPICStyleGOT())
7479    OpFlag = X86II::MO_GOTOFF;
7480  else if (Subtarget->isPICStyleStubPIC())
7481    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7482
7483  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
7484                                             CP->getAlignment(),
7485                                             CP->getOffset(), OpFlag);
7486  SDLoc DL(CP);
7487  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7488  // With PIC, the address is actually $g + Offset.
7489  if (OpFlag) {
7490    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7491                         DAG.getNode(X86ISD::GlobalBaseReg,
7492                                     SDLoc(), getPointerTy()),
7493                         Result);
7494  }
7495
7496  return Result;
7497}
7498
7499SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
7500  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
7501
7502  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7503  // global base reg.
7504  unsigned char OpFlag = 0;
7505  unsigned WrapperKind = X86ISD::Wrapper;
7506  CodeModel::Model M = getTargetMachine().getCodeModel();
7507
7508  if (Subtarget->isPICStyleRIPRel() &&
7509      (M == CodeModel::Small || M == CodeModel::Kernel))
7510    WrapperKind = X86ISD::WrapperRIP;
7511  else if (Subtarget->isPICStyleGOT())
7512    OpFlag = X86II::MO_GOTOFF;
7513  else if (Subtarget->isPICStyleStubPIC())
7514    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7515
7516  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
7517                                          OpFlag);
7518  SDLoc DL(JT);
7519  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7520
7521  // With PIC, the address is actually $g + Offset.
7522  if (OpFlag)
7523    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7524                         DAG.getNode(X86ISD::GlobalBaseReg,
7525                                     SDLoc(), getPointerTy()),
7526                         Result);
7527
7528  return Result;
7529}
7530
7531SDValue
7532X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
7533  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
7534
7535  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7536  // global base reg.
7537  unsigned char OpFlag = 0;
7538  unsigned WrapperKind = X86ISD::Wrapper;
7539  CodeModel::Model M = getTargetMachine().getCodeModel();
7540
7541  if (Subtarget->isPICStyleRIPRel() &&
7542      (M == CodeModel::Small || M == CodeModel::Kernel)) {
7543    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
7544      OpFlag = X86II::MO_GOTPCREL;
7545    WrapperKind = X86ISD::WrapperRIP;
7546  } else if (Subtarget->isPICStyleGOT()) {
7547    OpFlag = X86II::MO_GOT;
7548  } else if (Subtarget->isPICStyleStubPIC()) {
7549    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
7550  } else if (Subtarget->isPICStyleStubNoDynamic()) {
7551    OpFlag = X86II::MO_DARWIN_NONLAZY;
7552  }
7553
7554  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
7555
7556  SDLoc DL(Op);
7557  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7558
7559  // With PIC, the address is actually $g + Offset.
7560  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
7561      !Subtarget->is64Bit()) {
7562    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7563                         DAG.getNode(X86ISD::GlobalBaseReg,
7564                                     SDLoc(), getPointerTy()),
7565                         Result);
7566  }
7567
7568  // For symbols that require a load from a stub to get the address, emit the
7569  // load.
7570  if (isGlobalStubReference(OpFlag))
7571    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
7572                         MachinePointerInfo::getGOT(), false, false, false, 0);
7573
7574  return Result;
7575}
7576
7577SDValue
7578X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
7579  // Create the TargetBlockAddressAddress node.
7580  unsigned char OpFlags =
7581    Subtarget->ClassifyBlockAddressReference();
7582  CodeModel::Model M = getTargetMachine().getCodeModel();
7583  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
7584  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
7585  SDLoc dl(Op);
7586  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
7587                                             OpFlags);
7588
7589  if (Subtarget->isPICStyleRIPRel() &&
7590      (M == CodeModel::Small || M == CodeModel::Kernel))
7591    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7592  else
7593    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7594
7595  // With PIC, the address is actually $g + Offset.
7596  if (isGlobalRelativeToPICBase(OpFlags)) {
7597    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7598                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7599                         Result);
7600  }
7601
7602  return Result;
7603}
7604
7605SDValue
7606X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
7607                                      int64_t Offset, SelectionDAG &DAG) const {
7608  // Create the TargetGlobalAddress node, folding in the constant
7609  // offset if it is legal.
7610  unsigned char OpFlags =
7611    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
7612  CodeModel::Model M = getTargetMachine().getCodeModel();
7613  SDValue Result;
7614  if (OpFlags == X86II::MO_NO_FLAG &&
7615      X86::isOffsetSuitableForCodeModel(Offset, M)) {
7616    // A direct static reference to a global.
7617    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
7618    Offset = 0;
7619  } else {
7620    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
7621  }
7622
7623  if (Subtarget->isPICStyleRIPRel() &&
7624      (M == CodeModel::Small || M == CodeModel::Kernel))
7625    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7626  else
7627    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7628
7629  // With PIC, the address is actually $g + Offset.
7630  if (isGlobalRelativeToPICBase(OpFlags)) {
7631    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7632                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7633                         Result);
7634  }
7635
7636  // For globals that require a load from a stub to get the address, emit the
7637  // load.
7638  if (isGlobalStubReference(OpFlags))
7639    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
7640                         MachinePointerInfo::getGOT(), false, false, false, 0);
7641
7642  // If there was a non-zero offset that we didn't fold, create an explicit
7643  // addition for it.
7644  if (Offset != 0)
7645    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
7646                         DAG.getConstant(Offset, getPointerTy()));
7647
7648  return Result;
7649}
7650
7651SDValue
7652X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
7653  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
7654  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
7655  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
7656}
7657
7658static SDValue
7659GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
7660           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
7661           unsigned char OperandFlags, bool LocalDynamic = false) {
7662  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7663  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7664  SDLoc dl(GA);
7665  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7666                                           GA->getValueType(0),
7667                                           GA->getOffset(),
7668                                           OperandFlags);
7669
7670  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
7671                                           : X86ISD::TLSADDR;
7672
7673  if (InFlag) {
7674    SDValue Ops[] = { Chain,  TGA, *InFlag };
7675    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
7676  } else {
7677    SDValue Ops[]  = { Chain, TGA };
7678    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
7679  }
7680
7681  // TLSADDR is codegen'ed as a call. Inform MFI that the function has calls.
7682  MFI->setAdjustsStack(true);
7683
7684  SDValue Flag = Chain.getValue(1);
7685  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
7686}
7687
7688// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
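// This is expected to expand to roughly:
//   leal x@TLSGD(,%ebx,1), %eax
//   calll ___tls_get_addr@PLT
// with the variable's address returned in %eax.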
7689static SDValue
7690LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7691                                const EVT PtrVT) {
7692  SDValue InFlag;
7693  SDLoc dl(GA);  // ? function entry point might be better
7694  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
7695                                   DAG.getNode(X86ISD::GlobalBaseReg,
7696                                               SDLoc(), PtrVT), InFlag);
7697  InFlag = Chain.getValue(1);
7698
7699  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
7700}
7701
7702// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
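// This is expected to expand to roughly:
//   leaq x@TLSGD(%rip), %rdi
//   callq __tls_get_addr@PLT
// (plus padding for linker relaxation), with the address returned in %rax.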
7703static SDValue
7704LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7705                                const EVT PtrVT) {
7706  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
7707                    X86::RAX, X86II::MO_TLSGD);
7708}
7709
7710static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
7711                                           SelectionDAG &DAG,
7712                                           const EVT PtrVT,
7713                                           bool is64Bit) {
7714  SDLoc dl(GA);
7715
7716  // Get the start address of the TLS block for this module.
7717  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
7718      .getInfo<X86MachineFunctionInfo>();
7719  MFI->incNumLocalDynamicTLSAccesses();
7720
7721  SDValue Base;
7722  if (is64Bit) {
7723    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
7724                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
7725  } else {
7726    SDValue InFlag;
7727    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
7728        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
7729    InFlag = Chain.getValue(1);
7730    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
7731                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
7732  }
7733
7734  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
7735  // of Base.
7736
7737  // Build x@dtpoff.
7738  unsigned char OperandFlags = X86II::MO_DTPOFF;
7739  unsigned WrapperKind = X86ISD::Wrapper;
7740  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7741                                           GA->getValueType(0),
7742                                           GA->getOffset(), OperandFlags);
7743  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
7744
7745  // Add x@dtpoff with the base.
7746  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
7747}
7748
7749// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
7750static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7751                                   const EVT PtrVT, TLSModel::Model model,
7752                                   bool is64Bit, bool isPIC) {
7753  SDLoc dl(GA);
7754
7755  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
7756  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
7757                                                         is64Bit ? 257 : 256));
7758
7759  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
7760                                      DAG.getIntPtrConstant(0),
7761                                      MachinePointerInfo(Ptr),
7762                                      false, false, false, 0);
7763
7764  unsigned char OperandFlags = 0;
7765  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
7766  // initialexec.
7767  unsigned WrapperKind = X86ISD::Wrapper;
7768  if (model == TLSModel::LocalExec) {
7769    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
7770  } else if (model == TLSModel::InitialExec) {
7771    if (is64Bit) {
7772      OperandFlags = X86II::MO_GOTTPOFF;
7773      WrapperKind = X86ISD::WrapperRIP;
7774    } else {
7775      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
7776    }
7777  } else {
7778    llvm_unreachable("Unexpected model");
7779  }
7780
7781  // emit "addl x@ntpoff,%eax" (local exec)
7782  // or "addl x@indntpoff,%eax" (initial exec)
7783  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
7784  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7785                                           GA->getValueType(0),
7786                                           GA->getOffset(), OperandFlags);
7787  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
7788
7789  if (model == TLSModel::InitialExec) {
7790    if (isPIC && !is64Bit) {
7791      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
7792                          DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
7793                           Offset);
7794    }
7795
7796    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
7797                         MachinePointerInfo::getGOT(), false, false, false,
7798                         0);
7799  }
7800
7801  // The address of the thread local variable is the add of the thread
7802  // pointer with the offset of the variable.
7803  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
7804}
7805
7806SDValue
7807X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
7808
7809  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
7810  const GlobalValue *GV = GA->getGlobal();
7811
7812  if (Subtarget->isTargetELF()) {
7813    TLSModel::Model model = getTargetMachine().getTLSModel(GV);
7814
7815    switch (model) {
7816      case TLSModel::GeneralDynamic:
7817        if (Subtarget->is64Bit())
7818          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
7819        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
7820      case TLSModel::LocalDynamic:
7821        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
7822                                           Subtarget->is64Bit());
7823      case TLSModel::InitialExec:
7824      case TLSModel::LocalExec:
7825        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
7826                                   Subtarget->is64Bit(),
7827                        getTargetMachine().getRelocationModel() == Reloc::PIC_);
7828    }
7829    llvm_unreachable("Unknown TLS model.");
7830  }
7831
7832  if (Subtarget->isTargetDarwin()) {
7833    // Darwin only has one model of TLS.  Lower to that.
7834    unsigned char OpFlag = 0;
7835    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
7836                           X86ISD::WrapperRIP : X86ISD::Wrapper;
7837
7838    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7839    // global base reg.
7840    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
7841                  !Subtarget->is64Bit();
7842    if (PIC32)
7843      OpFlag = X86II::MO_TLVP_PIC_BASE;
7844    else
7845      OpFlag = X86II::MO_TLVP;
7846    SDLoc DL(Op);
7847    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
7848                                                GA->getValueType(0),
7849                                                GA->getOffset(), OpFlag);
7850    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7851
7852    // With PIC32, the address is actually $g + Offset.
7853    if (PIC32)
7854      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7855                           DAG.getNode(X86ISD::GlobalBaseReg,
7856                                       SDLoc(), getPointerTy()),
7857                           Offset);
7858
7859    // Lowering the machine isd will make sure everything is in the right
7860    // location.
7861    SDValue Chain = DAG.getEntryNode();
7862    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7863    SDValue Args[] = { Chain, Offset };
7864    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
7865
7866    // TLSCALL is codegen'ed as a call. Inform MFI that the function has calls.
7867    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7868    MFI->setAdjustsStack(true);
7869
7870    // And our return value (tls address) is in the standard call return value
7871    // location.
7872    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
7873    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
7874                              Chain.getValue(1));
7875  }
7876
7877  if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
7878    // Just use the implicit TLS architecture.
7879    // We need to generate something similar to:
7880    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
7881    //                                  ; from TEB
7882    //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
7883    //   mov     rcx, qword [rdx+rcx*8]
7884    //   mov     eax, .tls$:tlsvar
7885    //   [rax+rcx] contains the address
7886    // Windows 64bit: gs:0x58
7887    // Windows 32bit: fs:__tls_array
7888
7889    // If GV is an alias then use the aliasee for determining
7890    // thread-localness.
7891    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
7892      GV = GA->resolveAliasedGlobal(false);
7893    SDLoc dl(GA);
7894    SDValue Chain = DAG.getEntryNode();
7895
7896    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
7897    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
7898    // use its literal value of 0x2C.
7899    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
7900                                        ? Type::getInt8PtrTy(*DAG.getContext(),
7901                                                             256)
7902                                        : Type::getInt32PtrTy(*DAG.getContext(),
7903                                                              257));
7904
7905    SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
7906      (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
7907        DAG.getExternalSymbol("_tls_array", getPointerTy()));
7908
7909    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
7910                                        MachinePointerInfo(Ptr),
7911                                        false, false, false, 0);
7912
7913    // Load the _tls_index variable
7914    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
7915    if (Subtarget->is64Bit())
7916      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
7917                           IDX, MachinePointerInfo(), MVT::i32,
7918                           false, false, 0);
7919    else
7920      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
7921                        false, false, false, 0);
7922
7923    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
7924                                    getPointerTy());
7925    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
7926
7927    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
7928    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
7929                      false, false, false, 0);
7930
7931    // Get the offset of start of .tls section
7932    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7933                                             GA->getValueType(0),
7934                                             GA->getOffset(), X86II::MO_SECREL);
7935    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
7936
7937    // The address of the thread local variable is the add of the thread
7938    // pointer with the offset of the variable.
7939    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
7940  }
7941
7942  llvm_unreachable("TLS not implemented for this target.");
7943}
7944
7945/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
7946/// and take a 2 x i32 value to shift plus a shift amount.
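/// For SHL_PARTS on i32 halves, for example, the lowering below computes
///   Hi = (amt & 32) ? (lo << amt) : shld(hi, lo, amt)
///   Lo = (amt & 32) ? 0           : (lo << amt)
/// selected with CMOV on the result of testing (amt & 32); the hardware shifts
/// use the amount modulo 32. SRL_PARTS and SRA_PARTS are handled analogously
/// with SHRD.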
7947SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
7948  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
7949  EVT VT = Op.getValueType();
7950  unsigned VTBits = VT.getSizeInBits();
7951  SDLoc dl(Op);
7952  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
7953  SDValue ShOpLo = Op.getOperand(0);
7954  SDValue ShOpHi = Op.getOperand(1);
7955  SDValue ShAmt  = Op.getOperand(2);
7956  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
7957                                     DAG.getConstant(VTBits - 1, MVT::i8))
7958                       : DAG.getConstant(0, VT);
7959
7960  SDValue Tmp2, Tmp3;
7961  if (Op.getOpcode() == ISD::SHL_PARTS) {
7962    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
7963    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
7964  } else {
7965    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
7966    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
7967  }
7968
7969  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
7970                                DAG.getConstant(VTBits, MVT::i8));
7971  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
7972                             AndNode, DAG.getConstant(0, MVT::i8));
7973
7974  SDValue Hi, Lo;
7975  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
7976  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
7977  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
7978
7979  if (Op.getOpcode() == ISD::SHL_PARTS) {
7980    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7981    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7982  } else {
7983    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7984    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7985  }
7986
7987  SDValue Ops[2] = { Lo, Hi };
7988  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
7989}
7990
7991SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
7992                                           SelectionDAG &DAG) const {
7993  EVT SrcVT = Op.getOperand(0).getValueType();
7994
7995  if (SrcVT.isVector())
7996    return SDValue();
7997
7998  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
7999         "Unknown SINT_TO_FP to lower!");
8000
8001  // These are really Legal; return the operand so the caller accepts it as
8002  // Legal.
8003  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
8004    return Op;
8005  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
8006      Subtarget->is64Bit()) {
8007    return Op;
8008  }
8009
8010  SDLoc dl(Op);
8011  unsigned Size = SrcVT.getSizeInBits()/8;
8012  MachineFunction &MF = DAG.getMachineFunction();
8013  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
8014  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8015  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8016                               StackSlot,
8017                               MachinePointerInfo::getFixedStack(SSFI),
8018                               false, false, 0);
8019  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
8020}
8021
8022SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
8023                                     SDValue StackSlot,
8024                                     SelectionDAG &DAG) const {
8025  // Build the FILD
8026  SDLoc DL(Op);
8027  SDVTList Tys;
8028  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
8029  if (useSSE)
8030    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
8031  else
8032    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
8033
8034  unsigned ByteSize = SrcVT.getSizeInBits()/8;
8035
8036  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
8037  MachineMemOperand *MMO;
8038  if (FI) {
8039    int SSFI = FI->getIndex();
8040    MMO =
8041      DAG.getMachineFunction()
8042      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8043                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
8044  } else {
8045    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
8046    StackSlot = StackSlot.getOperand(1);
8047  }
8048  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
8049  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
8050                                           X86ISD::FILD, DL,
8051                                           Tys, Ops, array_lengthof(Ops),
8052                                           SrcVT, MMO);
8053
8054  if (useSSE) {
8055    Chain = Result.getValue(1);
8056    SDValue InFlag = Result.getValue(2);
8057
8058    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
8059    // shouldn't be necessary except that RFP cannot be live across
8060    // multiple blocks. When stackifier is fixed, they can be uncoupled.
8061    MachineFunction &MF = DAG.getMachineFunction();
8062    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
8063    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
8064    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8065    Tys = DAG.getVTList(MVT::Other);
8066    SDValue Ops[] = {
8067      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
8068    };
8069    MachineMemOperand *MMO =
8070      DAG.getMachineFunction()
8071      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8072                            MachineMemOperand::MOStore, SSFISize, SSFISize);
8073
8074    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
8075                                    Ops, array_lengthof(Ops),
8076                                    Op.getValueType(), MMO);
8077    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
8078                         MachinePointerInfo::getFixedStack(SSFI),
8079                         false, false, false, 0);
8080  }
8081
8082  return Result;
8083}
8084
8085// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
8086SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
8087                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
8089  /*
8090     movq       %rax,  %xmm0
8091     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
8092     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
8093     #ifdef __SSE3__
8094       haddpd   %xmm0, %xmm0
8095     #else
8096       pshufd   $0x4e, %xmm0, %xmm1
8097       addpd    %xmm1, %xmm0
8098     #endif
8099  */
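  // A sketch of why this works: write the u64 input as x = Hi * 2^32 + Lo
  // with 32-bit halves.  After the punpckldq, lane 0 of the vector holds the
  // double 2^52 + Lo and lane 1 holds 2^84 + Hi * 2^32, both exactly
  // representable because each half fits in the low 32 bits of the mantissa.
  // Subtracting c1 = { 2^52, 2^84 } leaves { Lo, Hi * 2^32 }, and the
  // horizontal add produces Lo + Hi * 2^32 = x, rounded once to double.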
8100
8101  SDLoc dl(Op);
8102  LLVMContext *Context = DAG.getContext();
8103
8104  // Build some magic constants.
8105  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
8106  Constant *C0 = ConstantDataVector::get(*Context, CV0);
8107  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
8108
8109  SmallVector<Constant*,2> CV1;
8110  CV1.push_back(
8111    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
8112                                      APInt(64, 0x4330000000000000ULL))));
8113  CV1.push_back(
8114    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
8115                                      APInt(64, 0x4530000000000000ULL))));
8116  Constant *C1 = ConstantVector::get(CV1);
8117  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
8118
8119  // Load the 64-bit value into an XMM register.
8120  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
8121                            Op.getOperand(0));
8122  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
8123                              MachinePointerInfo::getConstantPool(),
8124                              false, false, false, 16);
8125  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
8126                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
8127                              CLod0);
8128
8129  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
8130                              MachinePointerInfo::getConstantPool(),
8131                              false, false, false, 16);
8132  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
8133  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
8134  SDValue Result;
8135
8136  if (Subtarget->hasSSE3()) {
8137    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
8138    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
8139  } else {
8140    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
8141    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
8142                                           S2F, 0x4E, DAG);
8143    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
8144                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
8145                         Sub);
8146  }
8147
8148  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
8149                     DAG.getIntPtrConstant(0));
8150}
8151
8152// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
8153SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
8154                                               SelectionDAG &DAG) const {
8155  SDLoc dl(Op);
8156  // FP constant to bias correct the final result.
8157  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
8158                                   MVT::f64);
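  // A sketch of the trick used below: 0x4330000000000000 is the double 2^52.
  // OR'ing the 32-bit input u into its low mantissa bits gives the bit
  // pattern of exactly 2^52 + u (u < 2^32 fits in the mantissa), so
  // subtracting the bias 2^52 afterwards recovers (double)u with no rounding.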
8159
8160  // Load the 32-bit value into an XMM register.
8161  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
8162                             Op.getOperand(0));
8163
8164  // Zero out the upper parts of the register.
8165  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
8166
8167  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8168                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
8169                     DAG.getIntPtrConstant(0));
8170
8171  // Or the load with the bias.
8172  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
8173                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
8174                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8175                                                   MVT::v2f64, Load)),
8176                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
8177                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8178                                                   MVT::v2f64, Bias)));
8179  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8180                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
8181                   DAG.getIntPtrConstant(0));
8182
8183  // Subtract the bias.
8184  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
8185
8186  // Handle final rounding.
8187  EVT DestVT = Op.getValueType();
8188
8189  if (DestVT.bitsLT(MVT::f64))
8190    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
8191                       DAG.getIntPtrConstant(0));
8192  if (DestVT.bitsGT(MVT::f64))
8193    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
8194
  // Otherwise the destination is f64 and no rounding is needed.
8196  return Sub;
8197}
8198
8199SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
8200                                               SelectionDAG &DAG) const {
8201  SDValue N0 = Op.getOperand(0);
8202  EVT SVT = N0.getValueType();
8203  SDLoc dl(Op);
8204
8205  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
8206          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
8207         "Custom UINT_TO_FP is not supported!");
8208
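  // Zero-extending the i8/i16 elements to i32 leaves the i32 sign bit clear
  // (the values are at most 2^16 - 1), so the signed conversion below gives
  // the same result an unsigned conversion would.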
8209  EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
8210                             SVT.getVectorNumElements());
8211  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
8212                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
8213}
8214
8215SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
8216                                           SelectionDAG &DAG) const {
8217  SDValue N0 = Op.getOperand(0);
8218  SDLoc dl(Op);
8219
8220  if (Op.getValueType().isVector())
8221    return lowerUINT_TO_FP_vec(Op, DAG);
8222
  // Since UINT_TO_FP is legal as far as the DAG combiner can tell (it's
  // marked Custom), it won't be optimized to a SINT_TO_FP even when the sign
  // bit is known to be zero. Perform that optimization here.
8226  if (DAG.SignBitIsZero(N0))
8227    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
8228
8229  EVT SrcVT = N0.getValueType();
8230  EVT DstVT = Op.getValueType();
8231  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
8232    return LowerUINT_TO_FP_i64(Op, DAG);
8233  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
8234    return LowerUINT_TO_FP_i32(Op, DAG);
8235  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
8236    return SDValue();
8237
8238  // Make a 64-bit buffer, and use it to build an FILD.
8239  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
8240  if (SrcVT == MVT::i32) {
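    // Store the 32-bit value in the low word of the slot and zero the high
    // word; on little-endian x86 the 64-bit FILD below then sees the value
    // zero-extended to a non-negative i64.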
8241    SDValue WordOff = DAG.getConstant(4, getPointerTy());
8242    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
8243                                     getPointerTy(), StackSlot, WordOff);
8244    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8245                                  StackSlot, MachinePointerInfo(),
8246                                  false, false, 0);
8247    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
8248                                  OffsetSlot, MachinePointerInfo(),
8249                                  false, false, 0);
8250    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
8251    return Fild;
8252  }
8253
8254  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
8255  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8256                               StackSlot, MachinePointerInfo(),
8257                               false, false, 0);
8258  // For i64 source, we need to add the appropriate power of 2 if the input
8259  // was negative.  This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
8261  // we must be careful to do the computation in x87 extended precision, not
8262  // in SSE. (The generic code can't know it's OK to do this, or how to.)
8263  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
8264  MachineMemOperand *MMO =
8265    DAG.getMachineFunction()
8266    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8267                          MachineMemOperand::MOLoad, 8, 8);
8268
8269  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
8270  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
8271  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
8272                                         array_lengthof(Ops), MVT::i64, MMO);
8273
8274  APInt FF(32, 0x5F800000ULL);
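  // 0x5F800000 is the IEEE-754 single-precision encoding of 2^64.  If the
  // input's sign bit was set, the FILD above produced value - 2^64, so adding
  // this fudge factor (and 0.0 otherwise) corrects the result.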
8275
8276  // Check whether the sign bit is set.
8277  SDValue SignSet = DAG.getSetCC(dl,
8278                                 getSetCCResultType(*DAG.getContext(), MVT::i64),
8279                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
8280                                 ISD::SETLT);
8281
8282  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
8283  SDValue FudgePtr = DAG.getConstantPool(
8284                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
8285                                         getPointerTy());
8286
8287  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
8288  SDValue Zero = DAG.getIntPtrConstant(0);
8289  SDValue Four = DAG.getIntPtrConstant(4);
8290  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
8291                               Zero, Four);
8292  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
8293
8294  // Load the value out, extending it from f32 to f80.
8295  // FIXME: Avoid the extend by constructing the right constant pool?
8296  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
8297                                 FudgePtr, MachinePointerInfo::getConstantPool(),
8298                                 MVT::f32, false, false, 4);
8299  // Extend everything to 80 bits to force it to be done on x87.
8300  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
8301  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
8302}
8303
8304std::pair<SDValue,SDValue>
8305X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
8306                                    bool IsSigned, bool IsReplace) const {
8307  SDLoc DL(Op);
8308
8309  EVT DstTy = Op.getValueType();
8310
8311  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
8312    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
8313    DstTy = MVT::i64;
8314  }
8315
8316  assert(DstTy.getSimpleVT() <= MVT::i64 &&
8317         DstTy.getSimpleVT() >= MVT::i16 &&
8318         "Unknown FP_TO_INT to lower!");
8319
8320  // These are really Legal.
8321  if (DstTy == MVT::i32 &&
8322      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8323    return std::make_pair(SDValue(), SDValue());
8324  if (Subtarget->is64Bit() &&
8325      DstTy == MVT::i64 &&
8326      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8327    return std::make_pair(SDValue(), SDValue());
8328
8329  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
8330  // stack slot, or into the FTOL runtime function.
8331  MachineFunction &MF = DAG.getMachineFunction();
8332  unsigned MemSize = DstTy.getSizeInBits()/8;
8333  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8334  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8335
8336  unsigned Opc;
8337  if (!IsSigned && isIntegerTypeFTOL(DstTy))
8338    Opc = X86ISD::WIN_FTOL;
8339  else
8340    switch (DstTy.getSimpleVT().SimpleTy) {
8341    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
8342    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
8343    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
8344    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
8345    }
8346
8347  SDValue Chain = DAG.getEntryNode();
8348  SDValue Value = Op.getOperand(0);
8349  EVT TheVT = Op.getOperand(0).getValueType();
8350  // FIXME This causes a redundant load/store if the SSE-class value is already
8351  // in memory, such as if it is on the callstack.
8352  if (isScalarFPTypeInSSEReg(TheVT)) {
8353    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
8354    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
8355                         MachinePointerInfo::getFixedStack(SSFI),
8356                         false, false, 0);
8357    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
8358    SDValue Ops[] = {
8359      Chain, StackSlot, DAG.getValueType(TheVT)
8360    };
8361
8362    MachineMemOperand *MMO =
8363      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8364                              MachineMemOperand::MOLoad, MemSize, MemSize);
8365    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
8366                                    array_lengthof(Ops), DstTy, MMO);
8367    Chain = Value.getValue(1);
8368    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8369    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8370  }
8371
8372  MachineMemOperand *MMO =
8373    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8374                            MachineMemOperand::MOStore, MemSize, MemSize);
8375
8376  if (Opc != X86ISD::WIN_FTOL) {
8377    // Build the FP_TO_INT*_IN_MEM
8378    SDValue Ops[] = { Chain, Value, StackSlot };
8379    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
8380                                           Ops, array_lengthof(Ops), DstTy,
8381                                           MMO);
8382    return std::make_pair(FIST, StackSlot);
8383  } else {
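    // X86ISD::WIN_FTOL is selected to a call to the MSVC runtime's _ftol2
    // routine, which takes the value on the x87 stack and returns the 64-bit
    // result in EDX:EAX; copy both halves out and repack them below.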
8384    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
8385      DAG.getVTList(MVT::Other, MVT::Glue),
8386      Chain, Value);
8387    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
8388      MVT::i32, ftol.getValue(1));
8389    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
8390      MVT::i32, eax.getValue(2));
8391    SDValue Ops[] = { eax, edx };
8392    SDValue pair = IsReplace
8393      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
8394      : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
8395    return std::make_pair(pair, SDValue());
8396  }
8397}
8398
8399static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
8400                              const X86Subtarget *Subtarget) {
8401  MVT VT = Op->getValueType(0).getSimpleVT();
8402  SDValue In = Op->getOperand(0);
8403  MVT InVT = In.getValueType().getSimpleVT();
8404  SDLoc dl(Op);
8405
8406  // Optimize vectors in AVX mode:
8407  //
8408  //   v8i16 -> v8i32
8409  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
8410  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
8411  //   Concat upper and lower parts.
8412  //
8413  //   v4i32 -> v4i64
8414  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
8415  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
8416  //   Concat upper and lower parts.
8417  //
8418
8419  if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
8420      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
8421    return SDValue();
8422
8423  if (Subtarget->hasInt256())
8424    return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In);
8425
8426  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
8427  SDValue Undef = DAG.getUNDEF(InVT);
8428  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
8429  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
8430  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
8431
8432  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
8433                             VT.getVectorNumElements()/2);
8434
8435  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
8436  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
8437
8438  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
8439}
8440
8441SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
8442                                           SelectionDAG &DAG) const {
8443  if (Subtarget->hasFp256()) {
8444    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
8445    if (Res.getNode())
8446      return Res;
8447  }
8448
8449  return SDValue();
8450}

SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
8452                                            SelectionDAG &DAG) const {
8453  SDLoc DL(Op);
8454  MVT VT = Op.getValueType().getSimpleVT();
8455  SDValue In = Op.getOperand(0);
8456  MVT SVT = In.getValueType().getSimpleVT();
8457
8458  if (Subtarget->hasFp256()) {
8459    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
8460    if (Res.getNode())
8461      return Res;
8462  }
8463
8464  if (!VT.is256BitVector() || !SVT.is128BitVector() ||
8465      VT.getVectorNumElements() != SVT.getVectorNumElements())
8466    return SDValue();
8467
8468  assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
8469
8470  // AVX2 has better support of integer extending.
8471  if (Subtarget->hasInt256())
8472    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
8473
8474  SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
8475  static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
8476  SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
8477                           DAG.getVectorShuffle(MVT::v8i16, DL, In,
8478                                                DAG.getUNDEF(MVT::v8i16),
8479                                                &Mask[0]));
8480
8481  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
8482}
8483
8484SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
8485  SDLoc DL(Op);
8486  MVT VT = Op.getValueType().getSimpleVT();
8487  SDValue In = Op.getOperand(0);
8488  MVT SVT = In.getValueType().getSimpleVT();
8489
8490  if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) {
8491    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
8492    if (Subtarget->hasInt256()) {
8493      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
8494      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
8495      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
8496                                ShufMask);
8497      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
8498                         DAG.getIntPtrConstant(0));
8499    }
8500
8501    // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
8502    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
8503                               DAG.getIntPtrConstant(0));
8504    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
8505                               DAG.getIntPtrConstant(2));
8506
8507    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
8508    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
8509
8510    // The PSHUFD mask:
8511    static const int ShufMask1[] = {0, 2, 0, 0};
8512    SDValue Undef = DAG.getUNDEF(VT);
8513    OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
8514    OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);
8515
8516    // The MOVLHPS mask:
8517    static const int ShufMask2[] = {0, 1, 4, 5};
8518    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
8519  }
8520
8521  if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes a PSHUFB.
8523    if (Subtarget->hasInt256()) {
8524      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
8525
8526      SmallVector<SDValue,32> pshufbMask;
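      // Within each 128-bit lane, this mask selects bytes 0,1 4,5 8,9 12,13
      // (the low halves of the four dwords) into the lane's first 8 bytes and
      // zeroes the rest (0x80); the v4i64 shuffle below then packs the two
      // lanes' low quadwords together.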
8527      for (unsigned i = 0; i < 2; ++i) {
8528        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
8529        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
8530        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
8531        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
8532        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
8533        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
8534        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
8535        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
8536        for (unsigned j = 0; j < 8; ++j)
8537          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
8538      }
8539      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
8540                               &pshufbMask[0], 32);
8541      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
8542      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
8543
8544      static const int ShufMask[] = {0,  2,  -1,  -1};
8545      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
8546                                &ShufMask[0]);
8547      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
8548                       DAG.getIntPtrConstant(0));
8549      return DAG.getNode(ISD::BITCAST, DL, VT, In);
8550    }
8551
8552    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
8553                               DAG.getIntPtrConstant(0));
8554
8555    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
8556                               DAG.getIntPtrConstant(4));
8557
8558    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
8559    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
8560
8561    // The PSHUFB mask:
8562    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
8563                                   -1, -1, -1, -1, -1, -1, -1, -1};
8564
8565    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
8566    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
8567    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
8568
8569    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
8570    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
8571
8572    // The MOVLHPS Mask:
8573    static const int ShufMask2[] = {0, 1, 4, 5};
8574    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
8575    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
8576  }
8577
8578  // Handle truncation of V256 to V128 using shuffles.
8579  if (!VT.is128BitVector() || !SVT.is256BitVector())
8580    return SDValue();
8581
8582  assert(VT.getVectorNumElements() != SVT.getVectorNumElements() &&
8583         "Invalid op");
8584  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
8585
8586  unsigned NumElems = VT.getVectorNumElements();
8587  EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
8588                             NumElems * 2);
8589
8590  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
8591  // Prepare truncation shuffle mask
8592  for (unsigned i = 0; i != NumElems; ++i)
8593    MaskVec[i] = i * 2;
8594  SDValue V = DAG.getVectorShuffle(NVT, DL,
8595                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
8596                                   DAG.getUNDEF(NVT), &MaskVec[0]);
8597  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
8598                     DAG.getIntPtrConstant(0));
8599}
8600
8601SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
8602                                           SelectionDAG &DAG) const {
8603  MVT VT = Op.getValueType().getSimpleVT();
8604  if (VT.isVector()) {
8605    if (VT == MVT::v8i16)
8606      return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
8607                         DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op),
8608                                     MVT::v8i32, Op.getOperand(0)));
8609    return SDValue();
8610  }
8611
8612  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
8613    /*IsSigned=*/ true, /*IsReplace=*/ false);
8614  SDValue FIST = Vals.first, StackSlot = Vals.second;
8615  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
8616  if (FIST.getNode() == 0) return Op;
8617
8618  if (StackSlot.getNode())
8619    // Load the result.
8620    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
8621                       FIST, StackSlot, MachinePointerInfo(),
8622                       false, false, false, 0);
8623
8624  // The node is the result.
8625  return FIST;
8626}
8627
8628SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
8629                                           SelectionDAG &DAG) const {
8630  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
8631    /*IsSigned=*/ false, /*IsReplace=*/ false);
8632  SDValue FIST = Vals.first, StackSlot = Vals.second;
8633  assert(FIST.getNode() && "Unexpected failure");
8634
8635  if (StackSlot.getNode())
8636    // Load the result.
8637    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
8638                       FIST, StackSlot, MachinePointerInfo(),
8639                       false, false, false, 0);
8640
8641  // The node is the result.
8642  return FIST;
8643}
8644
8645static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
8646  SDLoc DL(Op);
8647  MVT VT = Op.getValueType().getSimpleVT();
8648  SDValue In = Op.getOperand(0);
8649  MVT SVT = In.getValueType().getSimpleVT();
8650
8651  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
8652
8653  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
8654                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
8655                                 In, DAG.getUNDEF(SVT)));
8656}
8657
8658SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
8659  LLVMContext *Context = DAG.getContext();
8660  SDLoc dl(Op);
8661  MVT VT = Op.getValueType().getSimpleVT();
8662  MVT EltVT = VT;
8663  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
8664  if (VT.isVector()) {
8665    EltVT = VT.getVectorElementType();
8666    NumElts = VT.getVectorNumElements();
8667  }
8668  Constant *C;
8669  if (EltVT == MVT::f64)
8670    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
8671                                          APInt(64, ~(1ULL << 63))));
8672  else
8673    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
8674                                          APInt(32, ~(1U << 31))));
8675  C = ConstantVector::getSplat(NumElts, C);
8676  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
8677  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
8678  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8679                             MachinePointerInfo::getConstantPool(),
8680                             false, false, false, Alignment);
8681  if (VT.isVector()) {
8682    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
8683    return DAG.getNode(ISD::BITCAST, dl, VT,
8684                       DAG.getNode(ISD::AND, dl, ANDVT,
8685                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
8686                                               Op.getOperand(0)),
8687                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
8688  }
8689  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
8690}
8691
8692SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
8693  LLVMContext *Context = DAG.getContext();
8694  SDLoc dl(Op);
8695  MVT VT = Op.getValueType().getSimpleVT();
8696  MVT EltVT = VT;
8697  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
8698  if (VT.isVector()) {
8699    EltVT = VT.getVectorElementType();
8700    NumElts = VT.getVectorNumElements();
8701  }
8702  Constant *C;
8703  if (EltVT == MVT::f64)
8704    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
8705                                          APInt(64, 1ULL << 63)));
8706  else
8707    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
8708                                          APInt(32, 1U << 31)));
8709  C = ConstantVector::getSplat(NumElts, C);
8710  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
8711  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
8712  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8713                             MachinePointerInfo::getConstantPool(),
8714                             false, false, false, Alignment);
8715  if (VT.isVector()) {
8716    MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
8717    return DAG.getNode(ISD::BITCAST, dl, VT,
8718                       DAG.getNode(ISD::XOR, dl, XORVT,
8719                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
8720                                               Op.getOperand(0)),
8721                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
8722  }
8723
8724  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
8725}
8726
8727SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8728  LLVMContext *Context = DAG.getContext();
8729  SDValue Op0 = Op.getOperand(0);
8730  SDValue Op1 = Op.getOperand(1);
8731  SDLoc dl(Op);
8732  MVT VT = Op.getValueType().getSimpleVT();
8733  MVT SrcVT = Op1.getValueType().getSimpleVT();
8734
8735  // If second operand is smaller, extend it first.
8736  if (SrcVT.bitsLT(VT)) {
8737    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
8738    SrcVT = VT;
8739  }
8740  // And if it is bigger, shrink it first.
8741  if (SrcVT.bitsGT(VT)) {
8742    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
8743    SrcVT = VT;
8744  }
8745
8746  // At this point the operands and the result should have the same
8747  // type, and that won't be f80 since that is not custom lowered.
8748
8749  // First get the sign bit of second operand.
8750  SmallVector<Constant*,4> CV;
8751  if (SrcVT == MVT::f64) {
8752    const fltSemantics &Sem = APFloat::IEEEdouble;
8753    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
8754    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
8755  } else {
8756    const fltSemantics &Sem = APFloat::IEEEsingle;
8757    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
8758    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
8759    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
8760    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
8761  }
8762  Constant *C = ConstantVector::get(CV);
8763  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8764  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
8765                              MachinePointerInfo::getConstantPool(),
8766                              false, false, false, 16);
8767  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
8768
8769  // Shift sign bit right or left if the two operands have different types.
8770  if (SrcVT.bitsGT(VT)) {
8771    // Op0 is MVT::f32, Op1 is MVT::f64.
8772    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
8773    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
8774                          DAG.getConstant(32, MVT::i32));
8775    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
8776    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
8777                          DAG.getIntPtrConstant(0));
8778  }
8779
8780  // Clear first operand sign bit.
8781  CV.clear();
8782  if (VT == MVT::f64) {
8783    const fltSemantics &Sem = APFloat::IEEEdouble;
8784    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
8785                                                   APInt(64, ~(1ULL << 63)))));
8786    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
8787  } else {
8788    const fltSemantics &Sem = APFloat::IEEEsingle;
8789    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
8790                                                   APInt(32, ~(1U << 31)))));
8791    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
8792    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
8793    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
8794  }
8795  C = ConstantVector::get(CV);
8796  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8797  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8798                              MachinePointerInfo::getConstantPool(),
8799                              false, false, false, 16);
8800  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
8801
8802  // Or the value with the sign bit.
8803  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
8804}
8805
8806static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
8807  SDValue N0 = Op.getOperand(0);
8808  SDLoc dl(Op);
8809  MVT VT = Op.getValueType().getSimpleVT();
8810
8811  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
8812  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
8813                                  DAG.getConstant(1, VT));
8814  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
8815}
8816
8817// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
8818//
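// For example, when the scalar result of a tree such as
//   (or (or (extractelt V, 0), (extractelt V, 1)), ...)
// covering every element of V (or of several same-typed vectors) is compared
// against zero, it can be replaced by (PTEST V, V), whose ZF is set iff V is
// all zeros.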
8819SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
8820                                                  SelectionDAG &DAG) const {
8821  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
8822
8823  if (!Subtarget->hasSSE41())
8824    return SDValue();
8825
8826  if (!Op->hasOneUse())
8827    return SDValue();
8828
8829  SDNode *N = Op.getNode();
8830  SDLoc DL(N);
8831
8832  SmallVector<SDValue, 8> Opnds;
8833  DenseMap<SDValue, unsigned> VecInMap;
8834  EVT VT = MVT::Other;
8835
  // Recognize a special case where a vector is cast into a wide integer to
8837  // test all 0s.
8838  Opnds.push_back(N->getOperand(0));
8839  Opnds.push_back(N->getOperand(1));
8840
8841  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
8842    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
8843    // BFS traverse all OR'd operands.
8844    if (I->getOpcode() == ISD::OR) {
8845      Opnds.push_back(I->getOperand(0));
8846      Opnds.push_back(I->getOperand(1));
8847      // Re-evaluate the number of nodes to be traversed.
8848      e += 2; // 2 more nodes (LHS and RHS) are pushed.
8849      continue;
8850    }
8851
    // Quit if this is not an EXTRACT_VECTOR_ELT.
8853    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8854      return SDValue();
8855
    // Quit if the index is not a constant.
8857    SDValue Idx = I->getOperand(1);
8858    if (!isa<ConstantSDNode>(Idx))
8859      return SDValue();
8860
8861    SDValue ExtractedFromVec = I->getOperand(0);
8862    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
8863    if (M == VecInMap.end()) {
8864      VT = ExtractedFromVec.getValueType();
8865      // Quit if not 128/256-bit vector.
8866      if (!VT.is128BitVector() && !VT.is256BitVector())
8867        return SDValue();
8868      // Quit if not the same type.
8869      if (VecInMap.begin() != VecInMap.end() &&
8870          VT != VecInMap.begin()->first.getValueType())
8871        return SDValue();
8872      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
8873    }
8874    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
8875  }
8876
8877  assert((VT.is128BitVector() || VT.is256BitVector()) &&
8878         "Not extracted from 128-/256-bit vector.");
8879
8880  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
8881  SmallVector<SDValue, 8> VecIns;
8882
8883  for (DenseMap<SDValue, unsigned>::const_iterator
8884        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
8885    // Quit if not all elements are used.
8886    if (I->second != FullMask)
8887      return SDValue();
8888    VecIns.push_back(I->first);
8889  }
8890
8891  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
8892
8893  // Cast all vectors into TestVT for PTEST.
8894  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
8895    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
8896
  // If more than one full vector is evaluated, OR them together before PTEST.
8898  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
8899    // Each iteration will OR 2 nodes and append the result until there is only
8900    // 1 node left, i.e. the final OR'd value of all vectors.
8901    SDValue LHS = VecIns[Slot];
8902    SDValue RHS = VecIns[Slot + 1];
8903    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
8904  }
8905
8906  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
8907                     VecIns.back(), VecIns.back());
8908}
8909
8910/// Emit nodes that will be selected as "test Op0,Op0", or something
8911/// equivalent.
8912SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
8913                                    SelectionDAG &DAG) const {
8914  SDLoc dl(Op);
8915
8916  // CF and OF aren't always set the way we want. Determine which
8917  // of these we need.
8918  bool NeedCF = false;
8919  bool NeedOF = false;
8920  switch (X86CC) {
8921  default: break;
8922  case X86::COND_A: case X86::COND_AE:
8923  case X86::COND_B: case X86::COND_BE:
8924    NeedCF = true;
8925    break;
8926  case X86::COND_G: case X86::COND_GE:
8927  case X86::COND_L: case X86::COND_LE:
8928  case X86::COND_O: case X86::COND_NO:
8929    NeedOF = true;
8930    break;
8931  }
8932
8933  // See if we can use the EFLAGS value from the operand instead of
8934  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
8935  // we prove that the arithmetic won't overflow, we can't use OF or CF.
8936  if (Op.getResNo() != 0 || NeedOF || NeedCF)
8937    // Emit a CMP with 0, which is the TEST pattern.
8938    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8939                       DAG.getConstant(0, Op.getValueType()));
8940
8941  unsigned Opcode = 0;
8942  unsigned NumOperands = 0;
8943
8944  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
8946  // of the arithmetic instruction and use a reduced bit-width instruction.
8947  bool NeedTruncation = false;
8948  SDValue ArithOp = Op;
8949  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
8950    SDValue Arith = Op->getOperand(0);
8951    // Both the trunc and the arithmetic op need to have one user each.
8952    if (Arith->hasOneUse())
8953      switch (Arith.getOpcode()) {
8954        default: break;
8955        case ISD::ADD:
8956        case ISD::SUB:
8957        case ISD::AND:
8958        case ISD::OR:
8959        case ISD::XOR: {
8960          NeedTruncation = true;
8961          ArithOp = Arith;
8962        }
8963      }
8964  }
8965
8966  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
8967  // which may be the result of a CAST.  We use the variable 'Op', which is the
8968  // non-casted variable when we check for possible users.
8969  switch (ArithOp.getOpcode()) {
8970  case ISD::ADD:
8971    // Due to an isel shortcoming, be conservative if this add is likely to be
8972    // selected as part of a load-modify-store instruction. When the root node
8973    // in a match is a store, isel doesn't know how to remap non-chain non-flag
8974    // uses of other nodes in the match, such as the ADD in this case. This
8975    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output.  Alas, even if none of our users are stores, that
8977    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
8978    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
8979    // climbing the DAG back to the root, and it doesn't seem to be worth the
8980    // effort.
8981    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8982         UE = Op.getNode()->use_end(); UI != UE; ++UI)
8983      if (UI->getOpcode() != ISD::CopyToReg &&
8984          UI->getOpcode() != ISD::SETCC &&
8985          UI->getOpcode() != ISD::STORE)
8986        goto default_case;
8987
8988    if (ConstantSDNode *C =
8989        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
8990      // An add of one will be selected as an INC.
8991      if (C->getAPIntValue() == 1) {
8992        Opcode = X86ISD::INC;
8993        NumOperands = 1;
8994        break;
8995      }
8996
8997      // An add of negative one (subtract of one) will be selected as a DEC.
8998      if (C->getAPIntValue().isAllOnesValue()) {
8999        Opcode = X86ISD::DEC;
9000        NumOperands = 1;
9001        break;
9002      }
9003    }
9004
9005    // Otherwise use a regular EFLAGS-setting add.
9006    Opcode = X86ISD::ADD;
9007    NumOperands = 2;
9008    break;
9009  case ISD::AND: {
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
9011    // because a TEST instruction will be better.
9012    bool NonFlagUse = false;
9013    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9014           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
9015      SDNode *User = *UI;
9016      unsigned UOpNo = UI.getOperandNo();
9017      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
        // Look past the truncate.
9019        UOpNo = User->use_begin().getOperandNo();
9020        User = *User->use_begin();
9021      }
9022
9023      if (User->getOpcode() != ISD::BRCOND &&
9024          User->getOpcode() != ISD::SETCC &&
9025          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
9026        NonFlagUse = true;
9027        break;
9028      }
9029    }
9030
9031    if (!NonFlagUse)
9032      break;
9033  }
9034    // FALL THROUGH
9035  case ISD::SUB:
9036  case ISD::OR:
9037  case ISD::XOR:
9038    // Due to the ISEL shortcoming noted above, be conservative if this op is
9039    // likely to be selected as part of a load-modify-store instruction.
9040    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9041           UE = Op.getNode()->use_end(); UI != UE; ++UI)
9042      if (UI->getOpcode() == ISD::STORE)
9043        goto default_case;
9044
9045    // Otherwise use a regular EFLAGS-setting instruction.
9046    switch (ArithOp.getOpcode()) {
9047    default: llvm_unreachable("unexpected operator!");
9048    case ISD::SUB: Opcode = X86ISD::SUB; break;
9049    case ISD::XOR: Opcode = X86ISD::XOR; break;
9050    case ISD::AND: Opcode = X86ISD::AND; break;
9051    case ISD::OR: {
9052      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
9053        SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
9054        if (EFLAGS.getNode())
9055          return EFLAGS;
9056      }
9057      Opcode = X86ISD::OR;
9058      break;
9059    }
9060    }
9061
9062    NumOperands = 2;
9063    break;
9064  case X86ISD::ADD:
9065  case X86ISD::SUB:
9066  case X86ISD::INC:
9067  case X86ISD::DEC:
9068  case X86ISD::OR:
9069  case X86ISD::XOR:
9070  case X86ISD::AND:
9071    return SDValue(Op.getNode(), 1);
9072  default:
9073  default_case:
9074    break;
9075  }
9076
9077  // If we found that truncation is beneficial, perform the truncation and
9078  // update 'Op'.
9079  if (NeedTruncation) {
9080    EVT VT = Op.getValueType();
9081    SDValue WideVal = Op->getOperand(0);
9082    EVT WideVT = WideVal.getValueType();
9083    unsigned ConvertedOp = 0;
9084    // Use a target machine opcode to prevent further DAGCombine
9085    // optimizations that may separate the arithmetic operations
9086    // from the setcc node.
9087    switch (WideVal.getOpcode()) {
9088      default: break;
9089      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
9090      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
9091      case ISD::AND: ConvertedOp = X86ISD::AND; break;
9092      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
9093      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
9094    }
9095
9096    if (ConvertedOp) {
9097      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9098      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
9099        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
9100        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
9101        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
9102      }
9103    }
9104  }
9105
9106  if (Opcode == 0)
9107    // Emit a CMP with 0, which is the TEST pattern.
9108    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
9109                       DAG.getConstant(0, Op.getValueType()));
9110
9111  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
9112  SmallVector<SDValue, 4> Ops;
9113  for (unsigned i = 0; i != NumOperands; ++i)
9114    Ops.push_back(Op.getOperand(i));
9115
9116  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
9117  DAG.ReplaceAllUsesWith(Op, New);
9118  return SDValue(New.getNode(), 1);
9119}
9120
9121/// Emit nodes that will be selected as "cmp Op0,Op1", or something
9122/// equivalent.
9123SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
9124                                   SelectionDAG &DAG) const {
9125  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
9126    if (C->getAPIntValue() == 0)
9127      return EmitTest(Op0, X86CC, DAG);
9128
9129  SDLoc dl(Op0);
9130  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
9131       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
9132    // Use SUB instead of CMP to enable CSE between SUB and CMP.
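    // For example, in code like "if (a != b) c = a - b;" the flag-producing
    // X86ISD::SUB emitted here and the value-producing subtraction can end up
    // sharing a single SUB instruction.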
9133    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
9134    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
9135                              Op0, Op1);
9136    return SDValue(Sub.getNode(), 1);
9137  }
9138  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
9139}
9140
9141/// Convert a comparison if required by the subtarget.
9142SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
9143                                                 SelectionDAG &DAG) const {
9144  // If the subtarget does not support the FUCOMI instruction, floating-point
9145  // comparisons have to be converted.
9146  if (Subtarget->hasCMov() ||
9147      Cmp.getOpcode() != X86ISD::CMP ||
9148      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
9149      !Cmp.getOperand(1).getValueType().isFloatingPoint())
9150    return Cmp;
9151
9152  // The instruction selector will select an FUCOM instruction instead of
9153  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
9154  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
9155  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
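  // This is selected to roughly "fnstsw %ax; sahf", which copies the x87
  // condition bits C0/C2/C3 into CF/PF/ZF so the integer condition codes can
  // be used.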
9156  SDLoc dl(Cmp);
9157  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
9158  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
9159  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
9160                            DAG.getConstant(8, MVT::i8));
9161  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
9162  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
9163}
9164
9165static bool isAllOnes(SDValue V) {
9166  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
9167  return C && C->isAllOnesValue();
9168}
9169
9170/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
9171/// if it's possible.
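/// For example, "(and (srl X, N), 1) != 0" and "(and X, (shl 1, N)) != 0" both
/// become (BT X, N) followed by a SETCC on the carry flag (COND_B for setne,
/// COND_AE for seteq); an AND against a power-of-2 mask too large for a TEST
/// immediate is handled the same way.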
9172SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
9173                                     SDLoc dl, SelectionDAG &DAG) const {
9174  SDValue Op0 = And.getOperand(0);
9175  SDValue Op1 = And.getOperand(1);
9176  if (Op0.getOpcode() == ISD::TRUNCATE)
9177    Op0 = Op0.getOperand(0);
9178  if (Op1.getOpcode() == ISD::TRUNCATE)
9179    Op1 = Op1.getOperand(0);
9180
9181  SDValue LHS, RHS;
9182  if (Op1.getOpcode() == ISD::SHL)
9183    std::swap(Op0, Op1);
9184  if (Op0.getOpcode() == ISD::SHL) {
9185    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
9186      if (And00C->getZExtValue() == 1) {
9187        // If we looked past a truncate, check that it's only truncating away
9188        // known zeros.
9189        unsigned BitWidth = Op0.getValueSizeInBits();
9190        unsigned AndBitWidth = And.getValueSizeInBits();
9191        if (BitWidth > AndBitWidth) {
9192          APInt Zeros, Ones;
9193          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
9194          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
9195            return SDValue();
9196        }
9197        LHS = Op1;
9198        RHS = Op0.getOperand(1);
9199      }
9200  } else if (Op1.getOpcode() == ISD::Constant) {
9201    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
9202    uint64_t AndRHSVal = AndRHS->getZExtValue();
9203    SDValue AndLHS = Op0;
9204
9205    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
9206      LHS = AndLHS.getOperand(0);
9207      RHS = AndLHS.getOperand(1);
9208    }
9209
9210    // Use BT if the immediate can't be encoded in a TEST instruction.
9211    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
9212      LHS = AndLHS;
9213      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
9214    }
9215  }
9216
9217  if (LHS.getNode()) {
9218    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
9219    // instruction.  Since the shift amount is in-range-or-undefined, we know
9220    // that doing a bittest on the i32 value is ok.  We extend to i32 because
9221    // the encoding for the i16 version is larger than the i32 version.
    // Also promote i16 to i32 for performance / code size reasons.
9223    if (LHS.getValueType() == MVT::i8 ||
9224        LHS.getValueType() == MVT::i16)
9225      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
9226
9227    // If the operand types disagree, extend the shift amount to match.  Since
9228    // BT ignores high bits (like shifts) we can use anyextend.
9229    if (LHS.getValueType() != RHS.getValueType())
9230      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
9231
9232    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
9233    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
9234    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9235                       DAG.getConstant(Cond, MVT::i8), BT);
9236  }
9237
9238  return SDValue();
9239}
9240
// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
// ones, and then concatenate the result back.
9243static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
9244  MVT VT = Op.getValueType().getSimpleVT();
9245
9246  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
9247         "Unsupported value type for operation");
9248
9249  unsigned NumElems = VT.getVectorNumElements();
9250  SDLoc dl(Op);
9251  SDValue CC = Op.getOperand(2);
9252
9253  // Extract the LHS vectors
9254  SDValue LHS = Op.getOperand(0);
9255  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
9256  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
9257
9258  // Extract the RHS vectors
9259  SDValue RHS = Op.getOperand(1);
9260  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
9261  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
9262
9263  // Issue the operation on the smaller types and concatenate the result back
9264  MVT EltVT = VT.getVectorElementType();
9265  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
9266  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
9267                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
9268                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
9269}
9270
9271static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
9272                           SelectionDAG &DAG) {
9273  SDValue Cond;
9274  SDValue Op0 = Op.getOperand(0);
9275  SDValue Op1 = Op.getOperand(1);
9276  SDValue CC = Op.getOperand(2);
9277  MVT VT = Op.getValueType().getSimpleVT();
9278  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
9279  bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint();
9280  SDLoc dl(Op);
9281
9282  if (isFP) {
9283#ifndef NDEBUG
9284    MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT();
9285    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
9286#endif
9287
9288    unsigned SSECC;
9289    bool Swap = false;
9290
9291    // SSE Condition code mapping:
9292    //  0 - EQ
9293    //  1 - LT
9294    //  2 - LE
9295    //  3 - UNORD
9296    //  4 - NEQ
9297    //  5 - NLT
9298    //  6 - NLE
9299    //  7 - ORD
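    // (8 is used below as a sentinel for SETUEQ/SETONE, which need two CMPPs.)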
9300    switch (SetCCOpcode) {
9301    default: llvm_unreachable("Unexpected SETCC condition");
9302    case ISD::SETOEQ:
9303    case ISD::SETEQ:  SSECC = 0; break;
9304    case ISD::SETOGT:
9305    case ISD::SETGT: Swap = true; // Fallthrough
9306    case ISD::SETLT:
9307    case ISD::SETOLT: SSECC = 1; break;
9308    case ISD::SETOGE:
9309    case ISD::SETGE: Swap = true; // Fallthrough
9310    case ISD::SETLE:
9311    case ISD::SETOLE: SSECC = 2; break;
9312    case ISD::SETUO:  SSECC = 3; break;
9313    case ISD::SETUNE:
9314    case ISD::SETNE:  SSECC = 4; break;
9315    case ISD::SETULE: Swap = true; // Fallthrough
9316    case ISD::SETUGE: SSECC = 5; break;
9317    case ISD::SETULT: Swap = true; // Fallthrough
9318    case ISD::SETUGT: SSECC = 6; break;
9319    case ISD::SETO:   SSECC = 7; break;
9320    case ISD::SETUEQ:
9321    case ISD::SETONE: SSECC = 8; break;
9322    }
9323    if (Swap)
9324      std::swap(Op0, Op1);
9325
9326    // In the two special cases we can't handle, emit two comparisons.
9327    if (SSECC == 8) {
9328      unsigned CC0, CC1;
9329      unsigned CombineOpc;
9330      if (SetCCOpcode == ISD::SETUEQ) {
9331        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
9332      } else {
9333        assert(SetCCOpcode == ISD::SETONE);
9334        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
9335      }
9336
9337      SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
9338                                 DAG.getConstant(CC0, MVT::i8));
9339      SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
9340                                 DAG.getConstant(CC1, MVT::i8));
9341      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
9342    }
9343    // Handle all other FP comparisons here.
9344    return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
9345                       DAG.getConstant(SSECC, MVT::i8));
9346  }
9347
9348  // Break 256-bit integer vector compare into smaller ones.
9349  if (VT.is256BitVector() && !Subtarget->hasInt256())
9350    return Lower256IntVSETCC(Op, DAG);
9351
9352  // We are handling one of the integer comparisons here.  Since SSE only has
9353  // GT and EQ comparisons for integers, swapping operands and multiple
9354  // operations may be required for some comparisons.
9355  unsigned Opc;
9356  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
9357
9358  switch (SetCCOpcode) {
9359  default: llvm_unreachable("Unexpected SETCC condition");
9360  case ISD::SETNE:  Invert = true;
9361  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
9362  case ISD::SETLT:  Swap = true;
9363  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
9364  case ISD::SETGE:  Swap = true;
9365  case ISD::SETLE:  Opc = X86ISD::PCMPGT; Invert = true; break;
9366  case ISD::SETULT: Swap = true;
9367  case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
9368  case ISD::SETUGE: Swap = true;
9369  case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
9370  }
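  // Note: the cases above intentionally fall through, e.g. SETNE sets Invert
  // and then reuses the SETEQ -> PCMPEQ mapping.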
9371
9372  // Special case: Use min/max operations for SETULE/SETUGE
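  // (x <=u y) is equivalent to (umin(x, y) == x) and (x >=u y) to
  // (umax(x, y) == x); the PCMPEQ against Op0 is emitted further below.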
9373  MVT VET = VT.getVectorElementType();
9374  bool hasMinMax =
9375       (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
9376    || (Subtarget->hasSSE2()  && (VET == MVT::i8));
9377
9378  if (hasMinMax) {
9379    switch (SetCCOpcode) {
9380    default: break;
9381    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
9382    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
9383    }
9384
9385    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
9386  }
9387
9388  if (Swap)
9389    std::swap(Op0, Op1);
9390
9391  // Check that the operation in question is available (most are plain SSE2,
9392  // but PCMPGTQ and PCMPEQQ have different requirements).
9393  if (VT == MVT::v2i64) {
9394    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
9395      assert(Subtarget->hasSSE2() && "Don't know how to lower!");
9396
9397      // First cast everything to the right type.
9398      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
9399      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
9400
9401      // Since SSE has no unsigned integer comparisons, we need to flip the sign
9402      // bits of the inputs before performing those operations. The lower
9403      // compare is always unsigned.
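      // Flipping the sign bit of a dword turns the signed PCMPGTD into an
      // unsigned compare of the original value.  For a signed 64-bit compare
      // only the low dwords (elements 0 and 2 of the v4i32 view) are flipped;
      // for an unsigned compare every dword is flipped.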
9404      SDValue SB;
9405      if (FlipSigns) {
9406        SB = DAG.getConstant(0x80000000U, MVT::v4i32);
9407      } else {
9408        SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
9409        SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
9410        SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
9411                         Sign, Zero, Sign, Zero);
9412      }
9413      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
9414      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
9415
9416      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
9417      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
9418      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
9419
9420      // Create masks for only the low parts/high parts of the 64-bit integers.
9421      static const int MaskHi[] = { 1, 1, 3, 3 };
9422      static const int MaskLo[] = { 0, 0, 2, 2 };
9423      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
9424      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
9425      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
9426
9427      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
9428      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
9429
9430      if (Invert)
9431        Result = DAG.getNOT(dl, Result, MVT::v4i32);
9432
9433      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
9434    }
9435
9436    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
9437      // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
9438      // pcmpeqd + pshufd + pand.
9439      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
9440
9441      // First cast everything to the right type.
9442      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
9443      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
9444
9445      // Do the compare.
9446      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
9447
9448      // Make sure the lower and upper halves are both all-ones.
9449      static const int Mask[] = { 1, 0, 3, 2 };
9450      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
9451      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
9452
9453      if (Invert)
9454        Result = DAG.getNOT(dl, Result, MVT::v4i32);
9455
9456      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
9457    }
9458  }
9459
9460  // Since SSE has no unsigned integer comparisons, we need to flip the sign
9461  // bits of the inputs before performing those operations.
9462  if (FlipSigns) {
9463    EVT EltVT = VT.getVectorElementType();
9464    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
9465    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
9466    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
9467  }
9468
9469  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
9470
9471  // If the logical-not of the result is required, perform that now.
9472  if (Invert)
9473    Result = DAG.getNOT(dl, Result, VT);
9474
9475  if (MinMax)
9476    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
9477
9478  return Result;
9479}
9480
9481SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9482
9483  MVT VT = Op.getValueType().getSimpleVT();
9484
9485  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
9486
9487  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
9488  SDValue Op0 = Op.getOperand(0);
9489  SDValue Op1 = Op.getOperand(1);
9490  SDLoc dl(Op);
9491  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
9492
9493  // Optimize to BT if possible.
9494  // Lower (X & (1 << N)) == 0 to BT(X, N).
9495  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
9496  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
9497  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
9498      Op1.getOpcode() == ISD::Constant &&
9499      cast<ConstantSDNode>(Op1)->isNullValue() &&
9500      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9501    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
9502    if (NewSetCC.getNode())
9503      return NewSetCC;
9504  }
9505
9506  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
9507  // these.
9508  if (Op1.getOpcode() == ISD::Constant &&
9509      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
9510       cast<ConstantSDNode>(Op1)->isNullValue()) &&
9511      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9512
9513    // If the input is a setcc, then reuse the input setcc or use a new one with
9514    // the inverted condition.
9515    if (Op0.getOpcode() == X86ISD::SETCC) {
9516      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
9517      bool Invert = (CC == ISD::SETNE) ^
9518        cast<ConstantSDNode>(Op1)->isNullValue();
9519      if (!Invert) return Op0;
9520
9521      CCode = X86::GetOppositeBranchCondition(CCode);
9522      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9523                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
9524    }
9525  }
9526
9527  bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint();
9528  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
9529  if (X86CC == X86::COND_INVALID)
9530    return SDValue();
9531
9532  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
9533  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
9534  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9535                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
9536}
9537
9538// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
9539static bool isX86LogicalCmp(SDValue Op) {
9540  unsigned Opc = Op.getNode()->getOpcode();
9541  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
9542      Opc == X86ISD::SAHF)
9543    return true;
9544  if (Op.getResNo() == 1 &&
9545      (Opc == X86ISD::ADD ||
9546       Opc == X86ISD::SUB ||
9547       Opc == X86ISD::ADC ||
9548       Opc == X86ISD::SBB ||
9549       Opc == X86ISD::SMUL ||
9550       Opc == X86ISD::UMUL ||
9551       Opc == X86ISD::INC ||
9552       Opc == X86ISD::DEC ||
9553       Opc == X86ISD::OR ||
9554       Opc == X86ISD::XOR ||
9555       Opc == X86ISD::AND))
9556    return true;
9557
9558  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
9559    return true;
9560
9561  return false;
9562}
9563
9564static bool isZero(SDValue V) {
9565  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
9566  return C && C->isNullValue();
9567}
9568
9569static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
9570  if (V.getOpcode() != ISD::TRUNCATE)
9571    return false;
9572
9573  SDValue VOp0 = V.getOperand(0);
9574  unsigned InBits = VOp0.getValueSizeInBits();
9575  unsigned Bits = V.getValueSizeInBits();
9576  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
9577}
9578
9579SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
9580  bool addTest = true;
9581  SDValue Cond  = Op.getOperand(0);
9582  SDValue Op1 = Op.getOperand(1);
9583  SDValue Op2 = Op.getOperand(2);
9584  SDLoc DL(Op);
9585  SDValue CC;
9586
9587  if (Cond.getOpcode() == ISD::SETCC) {
9588    SDValue NewCond = LowerSETCC(Cond, DAG);
9589    if (NewCond.getNode())
9590      Cond = NewCond;
9591  }
9592
9593  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
9594  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
9595  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
9596  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
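  // In the general path below, CMP(x, 1) sets the carry flag exactly when
  // x == 0, so SETCC_CARRY with COND_B materializes all-ones for x == 0 and
  // zero otherwise.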
9597  if (Cond.getOpcode() == X86ISD::SETCC &&
9598      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
9599      isZero(Cond.getOperand(1).getOperand(1))) {
9600    SDValue Cmp = Cond.getOperand(1);
9601
9602    unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
9603
9604    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
9605        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
9606      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
9607
9608      SDValue CmpOp0 = Cmp.getOperand(0);
9609      // Apply further optimizations for special cases
9610      // (select (x != 0), -1, 0) -> neg & sbb
9611      // (select (x == 0), 0, -1) -> neg & sbb
9612      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
9613        if (YC->isNullValue() &&
9614            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
9615          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
9616          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
9617                                    DAG.getConstant(0, CmpOp0.getValueType()),
9618                                    CmpOp0);
9619          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
9620                                    DAG.getConstant(X86::COND_B, MVT::i8),
9621                                    SDValue(Neg.getNode(), 1));
9622          return Res;
9623        }
9624
9625      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
9626                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
9627      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9628
9629      SDValue Res =   // Res = 0 or -1.
9630        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
9631                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
9632
9633      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
9634        Res = DAG.getNOT(DL, Res, Res.getValueType());
9635
9636      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
9637      if (N2C == 0 || !N2C->isNullValue())
9638        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
9639      return Res;
9640    }
9641  }
9642
9643  // Look past (and (setcc_carry (cmp ...)), 1).
9644  if (Cond.getOpcode() == ISD::AND &&
9645      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
9646    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
9647    if (C && C->getAPIntValue() == 1)
9648      Cond = Cond.getOperand(0);
9649  }
9650
9651  // If the condition flag is set by an X86ISD::CMP, then use it as the condition
9652  // setting operand in place of the X86ISD::SETCC.
9653  unsigned CondOpcode = Cond.getOpcode();
9654  if (CondOpcode == X86ISD::SETCC ||
9655      CondOpcode == X86ISD::SETCC_CARRY) {
9656    CC = Cond.getOperand(0);
9657
9658    SDValue Cmp = Cond.getOperand(1);
9659    unsigned Opc = Cmp.getOpcode();
9660    MVT VT = Op.getValueType().getSimpleVT();
9661
9662    bool IllegalFPCMov = false;
9663    if (VT.isFloatingPoint() && !VT.isVector() &&
9664        !isScalarFPTypeInSSEReg(VT))  // FPStack?
9665      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
9666
9667    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
9668        Opc == X86ISD::BT) { // FIXME
9669      Cond = Cmp;
9670      addTest = false;
9671    }
9672  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
9673             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
9674             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
9675              Cond.getOperand(0).getValueType() != MVT::i8)) {
9676    SDValue LHS = Cond.getOperand(0);
9677    SDValue RHS = Cond.getOperand(1);
9678    unsigned X86Opcode;
9679    unsigned X86Cond;
9680    SDVTList VTs;
9681    switch (CondOpcode) {
9682    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
9683    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
9684    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
9685    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
9686    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
9687    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
9688    default: llvm_unreachable("unexpected overflowing operator");
9689    }
9690    if (CondOpcode == ISD::UMULO)
9691      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
9692                          MVT::i32);
9693    else
9694      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
9695
9696    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
9697
9698    if (CondOpcode == ISD::UMULO)
9699      Cond = X86Op.getValue(2);
9700    else
9701      Cond = X86Op.getValue(1);
9702
9703    CC = DAG.getConstant(X86Cond, MVT::i8);
9704    addTest = false;
9705  }
9706
9707  if (addTest) {
9708    // Look past the truncate if the high bits are known zero.
9709    if (isTruncWithZeroHighBitsInput(Cond, DAG))
9710        Cond = Cond.getOperand(0);
9711
9712    // We know the result of AND is compared against zero. Try to match
9713    // it to BT.
9714    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
9715      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
9716      if (NewSetCC.getNode()) {
9717        CC = NewSetCC.getOperand(0);
9718        Cond = NewSetCC.getOperand(1);
9719        addTest = false;
9720      }
9721    }
9722  }
9723
9724  if (addTest) {
9725    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9726    Cond = EmitTest(Cond, X86::COND_NE, DAG);
9727  }
9728
9729  // a <  b ? -1 :  0 -> RES = ~setcc_carry
9730  // a <  b ?  0 : -1 -> RES = setcc_carry
9731  // a >= b ? -1 :  0 -> RES = setcc_carry
9732  // a >= b ?  0 : -1 -> RES = ~setcc_carry
9733  if (Cond.getOpcode() == X86ISD::SUB) {
9734    Cond = ConvertCmpIfNecessary(Cond, DAG);
9735    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
9736
9737    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
9738        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
9739      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
9740                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
9741      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
9742        return DAG.getNOT(DL, Res, Res.getValueType());
9743      return Res;
9744    }
9745  }
9746
9747  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
9748  // widen the cmov and push the truncate through. This avoids introducing a new
9749  // branch during isel and doesn't add any extensions.
9750  if (Op.getValueType() == MVT::i8 &&
9751      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
9752    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
9753    if (T1.getValueType() == T2.getValueType() &&
9754        // Blacklist CopyFromReg to avoid partial register stalls.
9755        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
9756      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
9757      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
9758      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
9759    }
9760  }
9761
9762  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
9763  // the condition is true.
9764  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
9765  SDValue Ops[] = { Op2, Op1, CC, Cond };
9766  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
9767}
9768
9769SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
9770                                            SelectionDAG &DAG) const {
9771  MVT VT = Op->getValueType(0).getSimpleVT();
9772  SDValue In = Op->getOperand(0);
9773  MVT InVT = In.getValueType().getSimpleVT();
9774  SDLoc dl(Op);
9775
9776  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
9777      (VT != MVT::v8i32 || InVT != MVT::v8i16))
9778    return SDValue();
9779
9780  if (Subtarget->hasInt256())
9781    return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In);
9782
9783  // Optimize vectors in AVX mode:
9784  // sign extend v8i16 to v8i32 and
9785  //             v4i32 to v4i64.
9786  //
9787  // Divide the input vector into two parts,
9788  // for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 },
9789  // use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
9790  // then concatenate the vectors back to the original VT.
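  // For example, for v8i16 -> v8i32 the low half {0..3} and the high half
  // {4..7} are each sign extended to v4i32 and then joined with CONCAT_VECTORS.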
9791
9792  unsigned NumElems = InVT.getVectorNumElements();
9793  SDValue Undef = DAG.getUNDEF(InVT);
9794
9795  SmallVector<int,8> ShufMask1(NumElems, -1);
9796  for (unsigned i = 0; i != NumElems/2; ++i)
9797    ShufMask1[i] = i;
9798
9799  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
9800
9801  SmallVector<int,8> ShufMask2(NumElems, -1);
9802  for (unsigned i = 0; i != NumElems/2; ++i)
9803    ShufMask2[i] = i + NumElems/2;
9804
9805  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
9806
9807  MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
9808                                VT.getVectorNumElements()/2);
9809
9810  OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
9811  OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
9812
9813  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
9814}
9815
9816// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
9817// X86ISD::SETCC nodes, each of which has no other use apart from the
9818// AND / OR.
9819static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
9820  Opc = Op.getOpcode();
9821  if (Opc != ISD::OR && Opc != ISD::AND)
9822    return false;
9823  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
9824          Op.getOperand(0).hasOneUse() &&
9825          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
9826          Op.getOperand(1).hasOneUse());
9827}
9828
9829// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
9830// 1, and that the SETCC node has a single use.
9831static bool isXor1OfSetCC(SDValue Op) {
9832  if (Op.getOpcode() != ISD::XOR)
9833    return false;
9834  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
9835  if (N1C && N1C->getAPIntValue() == 1) {
9836    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
9837      Op.getOperand(0).hasOneUse();
9838  }
9839  return false;
9840}
9841
9842SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
9843  bool addTest = true;
9844  SDValue Chain = Op.getOperand(0);
9845  SDValue Cond  = Op.getOperand(1);
9846  SDValue Dest  = Op.getOperand(2);
9847  SDLoc dl(Op);
9848  SDValue CC;
9849  bool Inverted = false;
9850
9851  if (Cond.getOpcode() == ISD::SETCC) {
9852    // Check for setcc([su]{add,sub,mul}o == 0).
9853    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
9854        isa<ConstantSDNode>(Cond.getOperand(1)) &&
9855        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
9856        Cond.getOperand(0).getResNo() == 1 &&
9857        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
9858         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
9859         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
9860         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
9861         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
9862         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
9863      Inverted = true;
9864      Cond = Cond.getOperand(0);
9865    } else {
9866      SDValue NewCond = LowerSETCC(Cond, DAG);
9867      if (NewCond.getNode())
9868        Cond = NewCond;
9869    }
9870  }
9871#if 0
9872  // FIXME: LowerXALUO doesn't handle these!!
9873  else if (Cond.getOpcode() == X86ISD::ADD  ||
9874           Cond.getOpcode() == X86ISD::SUB  ||
9875           Cond.getOpcode() == X86ISD::SMUL ||
9876           Cond.getOpcode() == X86ISD::UMUL)
9877    Cond = LowerXALUO(Cond, DAG);
9878#endif
9879
9880  // Look past (and (setcc_carry (cmp ...)), 1).
9881  if (Cond.getOpcode() == ISD::AND &&
9882      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
9883    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
9884    if (C && C->getAPIntValue() == 1)
9885      Cond = Cond.getOperand(0);
9886  }
9887
9888  // If the condition flag is set by an X86ISD::CMP, then use it as the condition
9889  // setting operand in place of the X86ISD::SETCC.
9890  unsigned CondOpcode = Cond.getOpcode();
9891  if (CondOpcode == X86ISD::SETCC ||
9892      CondOpcode == X86ISD::SETCC_CARRY) {
9893    CC = Cond.getOperand(0);
9894
9895    SDValue Cmp = Cond.getOperand(1);
9896    unsigned Opc = Cmp.getOpcode();
9897    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
9898    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
9899      Cond = Cmp;
9900      addTest = false;
9901    } else {
9902      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
9903      default: break;
9904      case X86::COND_O:
9905      case X86::COND_B:
9906        // These can only come from an arithmetic instruction with overflow,
9907        // e.g. SADDO, UADDO.
9908        Cond = Cond.getNode()->getOperand(1);
9909        addTest = false;
9910        break;
9911      }
9912    }
9913  }
9914  CondOpcode = Cond.getOpcode();
9915  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
9916      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
9917      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
9918       Cond.getOperand(0).getValueType() != MVT::i8)) {
9919    SDValue LHS = Cond.getOperand(0);
9920    SDValue RHS = Cond.getOperand(1);
9921    unsigned X86Opcode;
9922    unsigned X86Cond;
9923    SDVTList VTs;
9924    switch (CondOpcode) {
9925    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
9926    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
9927    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
9928    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
9929    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
9930    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
9931    default: llvm_unreachable("unexpected overflowing operator");
9932    }
9933    if (Inverted)
9934      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
9935    if (CondOpcode == ISD::UMULO)
9936      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
9937                          MVT::i32);
9938    else
9939      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
9940
9941    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
9942
9943    if (CondOpcode == ISD::UMULO)
9944      Cond = X86Op.getValue(2);
9945    else
9946      Cond = X86Op.getValue(1);
9947
9948    CC = DAG.getConstant(X86Cond, MVT::i8);
9949    addTest = false;
9950  } else {
9951    unsigned CondOpc;
9952    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
9953      SDValue Cmp = Cond.getOperand(0).getOperand(1);
9954      if (CondOpc == ISD::OR) {
9955        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
9956        // two branches instead of an explicit OR instruction with a
9957        // separate test.
9958        if (Cmp == Cond.getOperand(1).getOperand(1) &&
9959            isX86LogicalCmp(Cmp)) {
9960          CC = Cond.getOperand(0).getOperand(0);
9961          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9962                              Chain, Dest, CC, Cmp);
9963          CC = Cond.getOperand(1).getOperand(0);
9964          Cond = Cmp;
9965          addTest = false;
9966        }
9967      } else { // ISD::AND
9968        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
9969        // two branches instead of an explicit AND instruction with a
9970        // separate test. However, we only do this if this block doesn't
9971        // have a fall-through edge, because this requires an explicit
9972        // jmp when the condition is false.
9973        if (Cmp == Cond.getOperand(1).getOperand(1) &&
9974            isX86LogicalCmp(Cmp) &&
9975            Op.getNode()->hasOneUse()) {
9976          X86::CondCode CCode =
9977            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
9978          CCode = X86::GetOppositeBranchCondition(CCode);
9979          CC = DAG.getConstant(CCode, MVT::i8);
9980          SDNode *User = *Op.getNode()->use_begin();
9981          // Look for an unconditional branch following this conditional branch.
9982          // We need this because we need to reverse the successors in order
9983          // to implement FCMP_OEQ.
9984          if (User->getOpcode() == ISD::BR) {
9985            SDValue FalseBB = User->getOperand(1);
9986            SDNode *NewBR =
9987              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9988            assert(NewBR == User);
9989            (void)NewBR;
9990            Dest = FalseBB;
9991
9992            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9993                                Chain, Dest, CC, Cmp);
9994            X86::CondCode CCode =
9995              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
9996            CCode = X86::GetOppositeBranchCondition(CCode);
9997            CC = DAG.getConstant(CCode, MVT::i8);
9998            Cond = Cmp;
9999            addTest = false;
10000          }
10001        }
10002      }
10003    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
10004      // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
10005      // It should be transformed by the DAG combiner except when the condition
10006      // is set by an arithmetic-with-overflow node.
10007      X86::CondCode CCode =
10008        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
10009      CCode = X86::GetOppositeBranchCondition(CCode);
10010      CC = DAG.getConstant(CCode, MVT::i8);
10011      Cond = Cond.getOperand(0).getOperand(1);
10012      addTest = false;
10013    } else if (Cond.getOpcode() == ISD::SETCC &&
10014               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
10015      // For FCMP_OEQ, we can emit
10016      // two branches instead of an explicit AND instruction with a
10017      // separate test. However, we only do this if this block doesn't
10018      // have a fall-through edge, because this requires an explicit
10019      // jmp when the condition is false.
10020      if (Op.getNode()->hasOneUse()) {
10021        SDNode *User = *Op.getNode()->use_begin();
10022        // Look for an unconditional branch following this conditional branch.
10023        // We need this because we need to reverse the successors in order
10024        // to implement FCMP_OEQ.
10025        if (User->getOpcode() == ISD::BR) {
10026          SDValue FalseBB = User->getOperand(1);
10027          SDNode *NewBR =
10028            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
10029          assert(NewBR == User);
10030          (void)NewBR;
10031          Dest = FalseBB;
10032
10033          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
10034                                    Cond.getOperand(0), Cond.getOperand(1));
10035          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
10036          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10037          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10038                              Chain, Dest, CC, Cmp);
10039          CC = DAG.getConstant(X86::COND_P, MVT::i8);
10040          Cond = Cmp;
10041          addTest = false;
10042        }
10043      }
10044    } else if (Cond.getOpcode() == ISD::SETCC &&
10045               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
10046      // For FCMP_UNE, we can emit
10047      // two branches instead of an explicit AND instruction with a
10048      // separate test. However, we only do this if this block doesn't
10049      // have a fall-through edge, because this requires an explicit
10050      // jmp when the condition is false.
10051      if (Op.getNode()->hasOneUse()) {
10052        SDNode *User = *Op.getNode()->use_begin();
10053        // Look for an unconditional branch following this conditional branch.
10054        // We need this because we need to reverse the successors in order
10055        // to implement FCMP_UNE.
10056        if (User->getOpcode() == ISD::BR) {
10057          SDValue FalseBB = User->getOperand(1);
10058          SDNode *NewBR =
10059            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
10060          assert(NewBR == User);
10061          (void)NewBR;
10062
10063          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
10064                                    Cond.getOperand(0), Cond.getOperand(1));
10065          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
10066          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10067          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10068                              Chain, Dest, CC, Cmp);
10069          CC = DAG.getConstant(X86::COND_NP, MVT::i8);
10070          Cond = Cmp;
10071          addTest = false;
10072          Dest = FalseBB;
10073        }
10074      }
10075    }
10076  }
10077
10078  if (addTest) {
10079    // Look past the truncate if the high bits are known zero.
10080    if (isTruncWithZeroHighBitsInput(Cond, DAG))
10081        Cond = Cond.getOperand(0);
10082
10083    // We know the result of AND is compared against zero. Try to match
10084    // it to BT.
10085    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
10086      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
10087      if (NewSetCC.getNode()) {
10088        CC = NewSetCC.getOperand(0);
10089        Cond = NewSetCC.getOperand(1);
10090        addTest = false;
10091      }
10092    }
10093  }
10094
10095  if (addTest) {
10096    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10097    Cond = EmitTest(Cond, X86::COND_NE, DAG);
10098  }
10099  Cond = ConvertCmpIfNecessary(Cond, DAG);
10100  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10101                     Chain, Dest, CC, Cond);
10102}
10103
10104// Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
10105// Calls to _alloca are needed to probe the stack when allocating more than 4K
10106// bytes in one go. Touching the stack at 4K increments is necessary to ensure
10107// that the guard pages used by the OS virtual memory manager are allocated in
10108// the correct sequence.
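// Two lowerings are produced below: with segmented stacks the size is copied
// into a virtual register and an X86ISD::SEG_ALLOCA node is emitted; otherwise
// the size goes into RAX/EAX, an X86ISD::WIN_ALLOCA probes the stack, and the
// new stack pointer is read back as the result.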
10109SDValue
10110X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
10111                                           SelectionDAG &DAG) const {
10112  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
10113          getTargetMachine().Options.EnableSegmentedStacks) &&
10114         "This should be used only on Windows targets or when segmented stacks "
10115         "are being used");
10116  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
10117  SDLoc dl(Op);
10118
10119  // Get the inputs.
10120  SDValue Chain = Op.getOperand(0);
10121  SDValue Size  = Op.getOperand(1);
10122  // FIXME: Ensure alignment here
10123
10124  bool Is64Bit = Subtarget->is64Bit();
10125  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
10126
10127  if (getTargetMachine().Options.EnableSegmentedStacks) {
10128    MachineFunction &MF = DAG.getMachineFunction();
10129    MachineRegisterInfo &MRI = MF.getRegInfo();
10130
10131    if (Is64Bit) {
10132      // The 64-bit implementation of segmented stacks needs to clobber both r10
10133      // and r11. This makes it impossible to use it along with nested parameters.
10134      const Function *F = MF.getFunction();
10135
10136      for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
10137           I != E; ++I)
10138        if (I->hasNestAttr())
10139          report_fatal_error("Cannot use segmented stacks with functions that "
10140                             "have nested arguments.");
10141    }
10142
10143    const TargetRegisterClass *AddrRegClass =
10144      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
10145    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
10146    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
10147    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
10148                                DAG.getRegister(Vreg, SPTy));
10149    SDValue Ops1[2] = { Value, Chain };
10150    return DAG.getMergeValues(Ops1, 2, dl);
10151  } else {
10152    SDValue Flag;
10153    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
10154
10155    Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
10156    Flag = Chain.getValue(1);
10157    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10158
10159    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
10160    Flag = Chain.getValue(1);
10161
10162    const X86RegisterInfo *RegInfo =
10163      static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
10164    Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
10165                               SPTy).getValue(1);
10166
10167    SDValue Ops1[2] = { Chain.getValue(0), Chain };
10168    return DAG.getMergeValues(Ops1, 2, dl);
10169  }
10170}
10171
10172SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
10173  MachineFunction &MF = DAG.getMachineFunction();
10174  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
10175
10176  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10177  SDLoc DL(Op);
10178
10179  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
10180    // vastart just stores the address of the VarArgsFrameIndex slot into the
10181    // memory location argument.
10182    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
10183                                   getPointerTy());
10184    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10185                        MachinePointerInfo(SV), false, false, 0);
10186  }
10187
10188  // __va_list_tag:
10189  //   gp_offset         (0 - 6 * 8)
10190  //   fp_offset         (48 - 48 + 8 * 16)
10191  //   overflow_arg_area (points to parameters coming in memory).
10192  //   reg_save_area
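  // The four fields are stored below at byte offsets 0, 4, 8 and 16 of the
  // va_list object.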
10193  SmallVector<SDValue, 8> MemOps;
10194  SDValue FIN = Op.getOperand(1);
10195  // Store gp_offset
10196  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
10197                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
10198                                               MVT::i32),
10199                               FIN, MachinePointerInfo(SV), false, false, 0);
10200  MemOps.push_back(Store);
10201
10202  // Store fp_offset
10203  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
10204                    FIN, DAG.getIntPtrConstant(4));
10205  Store = DAG.getStore(Op.getOperand(0), DL,
10206                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
10207                                       MVT::i32),
10208                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
10209  MemOps.push_back(Store);
10210
10211  // Store ptr to overflow_arg_area
10212  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
10213                    FIN, DAG.getIntPtrConstant(4));
10214  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
10215                                    getPointerTy());
10216  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
10217                       MachinePointerInfo(SV, 8),
10218                       false, false, 0);
10219  MemOps.push_back(Store);
10220
10221  // Store ptr to reg_save_area.
10222  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
10223                    FIN, DAG.getIntPtrConstant(8));
10224  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
10225                                    getPointerTy());
10226  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
10227                       MachinePointerInfo(SV, 16), false, false, 0);
10228  MemOps.push_back(Store);
10229  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
10230                     &MemOps[0], MemOps.size());
10231}
10232
10233SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10234  assert(Subtarget->is64Bit() &&
10235         "LowerVAARG only handles 64-bit va_arg!");
10236  assert((Subtarget->isTargetLinux() ||
10237          Subtarget->isTargetDarwin()) &&
10238          "Unhandled target in LowerVAARG");
10239  assert(Op.getNode()->getNumOperands() == 4);
10240  SDValue Chain = Op.getOperand(0);
10241  SDValue SrcPtr = Op.getOperand(1);
10242  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10243  unsigned Align = Op.getConstantOperandVal(3);
10244  SDLoc dl(Op);
10245
10246  EVT ArgVT = Op.getNode()->getValueType(0);
10247  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
10248  uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
10249  uint8_t ArgMode;
10250
10251  // Decide which area this value should be read from.
10252  // TODO: Implement the AMD64 ABI in its entirety. This simple
10253  // selection mechanism works only for the basic types.
10254  if (ArgVT == MVT::f80) {
10255    llvm_unreachable("va_arg for f80 not yet implemented");
10256  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
10257    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
10258  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
10259    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
10260  } else {
10261    llvm_unreachable("Unhandled argument type in LowerVAARG");
10262  }
10263
10264  if (ArgMode == 2) {
10265    // Sanity Check: Make sure using fp_offset makes sense.
10266    assert(!getTargetMachine().Options.UseSoftFloat &&
10267           !(DAG.getMachineFunction()
10268                .getFunction()->getAttributes()
10269                .hasAttribute(AttributeSet::FunctionIndex,
10270                              Attribute::NoImplicitFloat)) &&
10271           Subtarget->hasSSE1());
10272  }
10273
10274  // Insert VAARG_64 node into the DAG
10275  // VAARG_64 returns two values: Variable Argument Address, Chain
10276  SmallVector<SDValue, 11> InstOps;
10277  InstOps.push_back(Chain);
10278  InstOps.push_back(SrcPtr);
10279  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
10280  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
10281  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
10282  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
10283  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
10284                                          VTs, &InstOps[0], InstOps.size(),
10285                                          MVT::i64,
10286                                          MachinePointerInfo(SV),
10287                                          /*Align=*/0,
10288                                          /*Volatile=*/false,
10289                                          /*ReadMem=*/true,
10290                                          /*WriteMem=*/true);
10291  Chain = VAARG.getValue(1);
10292
10293  // Load the next argument and return it
10294  return DAG.getLoad(ArgVT, dl,
10295                     Chain,
10296                     VAARG,
10297                     MachinePointerInfo(),
10298                     false, false, false, 0);
10299}
10300
10301static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
10302                           SelectionDAG &DAG) {
10303  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
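  // That struct is 4 + 4 + 8 + 8 = 24 bytes, which is why the plain 24-byte
  // memcpy below suffices to implement va_copy.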
10304  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
10305  SDValue Chain = Op.getOperand(0);
10306  SDValue DstPtr = Op.getOperand(1);
10307  SDValue SrcPtr = Op.getOperand(2);
10308  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10309  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10310  SDLoc DL(Op);
10311
10312  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
10313                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
10314                       false,
10315                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
10316}
10317
10318// getTargetVShiftNode - Handle vector element shifts where the shift amount
10319// may or may not be a constant. Takes immediate version of shift as input.
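// For example, an X86ISD::VSRLI whose amount is not a constant is rewritten as
// an X86ISD::VSRL whose amount operand is the vector <ShAmt, 0, undef, undef>
// bitcast to a 128-bit vector with the same element type as the shifted vector.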
10320static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
10321                                   SDValue SrcOp, SDValue ShAmt,
10322                                   SelectionDAG &DAG) {
10323  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
10324
10325  if (isa<ConstantSDNode>(ShAmt)) {
10326    // Constant may be a TargetConstant. Use a regular constant.
10327    uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
10328    switch (Opc) {
10329      default: llvm_unreachable("Unknown target vector shift node");
10330      case X86ISD::VSHLI:
10331      case X86ISD::VSRLI:
10332      case X86ISD::VSRAI:
10333        return DAG.getNode(Opc, dl, VT, SrcOp,
10334                           DAG.getConstant(ShiftAmt, MVT::i32));
10335    }
10336  }
10337
10338  // Change opcode to non-immediate version
10339  switch (Opc) {
10340    default: llvm_unreachable("Unknown target vector shift node");
10341    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
10342    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
10343    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
10344  }
10345
10346  // Need to build a vector containing the shift amount.
10347  // The shift amount is 32 bits, but SSE instructions read 64 bits, so fill with 0.
10348  SDValue ShOps[4];
10349  ShOps[0] = ShAmt;
10350  ShOps[1] = DAG.getConstant(0, MVT::i32);
10351  ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
10352  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
10353
10354  // The return type has to be a 128-bit type with the same element
10355  // type as the input type.
10356  MVT EltVT = VT.getVectorElementType().getSimpleVT();
10357  EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
10358
10359  ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
10360  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
10361}
10362
10363static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
10364  SDLoc dl(Op);
10365  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10366  switch (IntNo) {
10367  default: return SDValue();    // Don't custom lower most intrinsics.
10368  // Comparison intrinsics.
10369  case Intrinsic::x86_sse_comieq_ss:
10370  case Intrinsic::x86_sse_comilt_ss:
10371  case Intrinsic::x86_sse_comile_ss:
10372  case Intrinsic::x86_sse_comigt_ss:
10373  case Intrinsic::x86_sse_comige_ss:
10374  case Intrinsic::x86_sse_comineq_ss:
10375  case Intrinsic::x86_sse_ucomieq_ss:
10376  case Intrinsic::x86_sse_ucomilt_ss:
10377  case Intrinsic::x86_sse_ucomile_ss:
10378  case Intrinsic::x86_sse_ucomigt_ss:
10379  case Intrinsic::x86_sse_ucomige_ss:
10380  case Intrinsic::x86_sse_ucomineq_ss:
10381  case Intrinsic::x86_sse2_comieq_sd:
10382  case Intrinsic::x86_sse2_comilt_sd:
10383  case Intrinsic::x86_sse2_comile_sd:
10384  case Intrinsic::x86_sse2_comigt_sd:
10385  case Intrinsic::x86_sse2_comige_sd:
10386  case Intrinsic::x86_sse2_comineq_sd:
10387  case Intrinsic::x86_sse2_ucomieq_sd:
10388  case Intrinsic::x86_sse2_ucomilt_sd:
10389  case Intrinsic::x86_sse2_ucomile_sd:
10390  case Intrinsic::x86_sse2_ucomigt_sd:
10391  case Intrinsic::x86_sse2_ucomige_sd:
10392  case Intrinsic::x86_sse2_ucomineq_sd: {
10393    unsigned Opc;
10394    ISD::CondCode CC;
10395    switch (IntNo) {
10396    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10397    case Intrinsic::x86_sse_comieq_ss:
10398    case Intrinsic::x86_sse2_comieq_sd:
10399      Opc = X86ISD::COMI;
10400      CC = ISD::SETEQ;
10401      break;
10402    case Intrinsic::x86_sse_comilt_ss:
10403    case Intrinsic::x86_sse2_comilt_sd:
10404      Opc = X86ISD::COMI;
10405      CC = ISD::SETLT;
10406      break;
10407    case Intrinsic::x86_sse_comile_ss:
10408    case Intrinsic::x86_sse2_comile_sd:
10409      Opc = X86ISD::COMI;
10410      CC = ISD::SETLE;
10411      break;
10412    case Intrinsic::x86_sse_comigt_ss:
10413    case Intrinsic::x86_sse2_comigt_sd:
10414      Opc = X86ISD::COMI;
10415      CC = ISD::SETGT;
10416      break;
10417    case Intrinsic::x86_sse_comige_ss:
10418    case Intrinsic::x86_sse2_comige_sd:
10419      Opc = X86ISD::COMI;
10420      CC = ISD::SETGE;
10421      break;
10422    case Intrinsic::x86_sse_comineq_ss:
10423    case Intrinsic::x86_sse2_comineq_sd:
10424      Opc = X86ISD::COMI;
10425      CC = ISD::SETNE;
10426      break;
10427    case Intrinsic::x86_sse_ucomieq_ss:
10428    case Intrinsic::x86_sse2_ucomieq_sd:
10429      Opc = X86ISD::UCOMI;
10430      CC = ISD::SETEQ;
10431      break;
10432    case Intrinsic::x86_sse_ucomilt_ss:
10433    case Intrinsic::x86_sse2_ucomilt_sd:
10434      Opc = X86ISD::UCOMI;
10435      CC = ISD::SETLT;
10436      break;
10437    case Intrinsic::x86_sse_ucomile_ss:
10438    case Intrinsic::x86_sse2_ucomile_sd:
10439      Opc = X86ISD::UCOMI;
10440      CC = ISD::SETLE;
10441      break;
10442    case Intrinsic::x86_sse_ucomigt_ss:
10443    case Intrinsic::x86_sse2_ucomigt_sd:
10444      Opc = X86ISD::UCOMI;
10445      CC = ISD::SETGT;
10446      break;
10447    case Intrinsic::x86_sse_ucomige_ss:
10448    case Intrinsic::x86_sse2_ucomige_sd:
10449      Opc = X86ISD::UCOMI;
10450      CC = ISD::SETGE;
10451      break;
10452    case Intrinsic::x86_sse_ucomineq_ss:
10453    case Intrinsic::x86_sse2_ucomineq_sd:
10454      Opc = X86ISD::UCOMI;
10455      CC = ISD::SETNE;
10456      break;
10457    }
10458
10459    SDValue LHS = Op.getOperand(1);
10460    SDValue RHS = Op.getOperand(2);
10461    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
10462    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
10463    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
10464    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
10465                                DAG.getConstant(X86CC, MVT::i8), Cond);
10466    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
10467  }
10468
10469  // Arithmetic intrinsics.
10470  case Intrinsic::x86_sse2_pmulu_dq:
10471  case Intrinsic::x86_avx2_pmulu_dq:
10472    return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
10473                       Op.getOperand(1), Op.getOperand(2));
10474
10475  // SSE2/AVX2 sub with unsigned saturation intrinsics
10476  case Intrinsic::x86_sse2_psubus_b:
10477  case Intrinsic::x86_sse2_psubus_w:
10478  case Intrinsic::x86_avx2_psubus_b:
10479  case Intrinsic::x86_avx2_psubus_w:
10480    return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
10481                       Op.getOperand(1), Op.getOperand(2));
10482
10483  // SSE3/AVX horizontal add/sub intrinsics
10484  case Intrinsic::x86_sse3_hadd_ps:
10485  case Intrinsic::x86_sse3_hadd_pd:
10486  case Intrinsic::x86_avx_hadd_ps_256:
10487  case Intrinsic::x86_avx_hadd_pd_256:
10488  case Intrinsic::x86_sse3_hsub_ps:
10489  case Intrinsic::x86_sse3_hsub_pd:
10490  case Intrinsic::x86_avx_hsub_ps_256:
10491  case Intrinsic::x86_avx_hsub_pd_256:
10492  case Intrinsic::x86_ssse3_phadd_w_128:
10493  case Intrinsic::x86_ssse3_phadd_d_128:
10494  case Intrinsic::x86_avx2_phadd_w:
10495  case Intrinsic::x86_avx2_phadd_d:
10496  case Intrinsic::x86_ssse3_phsub_w_128:
10497  case Intrinsic::x86_ssse3_phsub_d_128:
10498  case Intrinsic::x86_avx2_phsub_w:
10499  case Intrinsic::x86_avx2_phsub_d: {
10500    unsigned Opcode;
10501    switch (IntNo) {
10502    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10503    case Intrinsic::x86_sse3_hadd_ps:
10504    case Intrinsic::x86_sse3_hadd_pd:
10505    case Intrinsic::x86_avx_hadd_ps_256:
10506    case Intrinsic::x86_avx_hadd_pd_256:
10507      Opcode = X86ISD::FHADD;
10508      break;
10509    case Intrinsic::x86_sse3_hsub_ps:
10510    case Intrinsic::x86_sse3_hsub_pd:
10511    case Intrinsic::x86_avx_hsub_ps_256:
10512    case Intrinsic::x86_avx_hsub_pd_256:
10513      Opcode = X86ISD::FHSUB;
10514      break;
10515    case Intrinsic::x86_ssse3_phadd_w_128:
10516    case Intrinsic::x86_ssse3_phadd_d_128:
10517    case Intrinsic::x86_avx2_phadd_w:
10518    case Intrinsic::x86_avx2_phadd_d:
10519      Opcode = X86ISD::HADD;
10520      break;
10521    case Intrinsic::x86_ssse3_phsub_w_128:
10522    case Intrinsic::x86_ssse3_phsub_d_128:
10523    case Intrinsic::x86_avx2_phsub_w:
10524    case Intrinsic::x86_avx2_phsub_d:
10525      Opcode = X86ISD::HSUB;
10526      break;
10527    }
10528    return DAG.getNode(Opcode, dl, Op.getValueType(),
10529                       Op.getOperand(1), Op.getOperand(2));
10530  }
10531
10532  // SSE2/SSE41/AVX2 integer max/min intrinsics.
10533  case Intrinsic::x86_sse2_pmaxu_b:
10534  case Intrinsic::x86_sse41_pmaxuw:
10535  case Intrinsic::x86_sse41_pmaxud:
10536  case Intrinsic::x86_avx2_pmaxu_b:
10537  case Intrinsic::x86_avx2_pmaxu_w:
10538  case Intrinsic::x86_avx2_pmaxu_d:
10539  case Intrinsic::x86_sse2_pminu_b:
10540  case Intrinsic::x86_sse41_pminuw:
10541  case Intrinsic::x86_sse41_pminud:
10542  case Intrinsic::x86_avx2_pminu_b:
10543  case Intrinsic::x86_avx2_pminu_w:
10544  case Intrinsic::x86_avx2_pminu_d:
10545  case Intrinsic::x86_sse41_pmaxsb:
10546  case Intrinsic::x86_sse2_pmaxs_w:
10547  case Intrinsic::x86_sse41_pmaxsd:
10548  case Intrinsic::x86_avx2_pmaxs_b:
10549  case Intrinsic::x86_avx2_pmaxs_w:
10550  case Intrinsic::x86_avx2_pmaxs_d:
10551  case Intrinsic::x86_sse41_pminsb:
10552  case Intrinsic::x86_sse2_pmins_w:
10553  case Intrinsic::x86_sse41_pminsd:
10554  case Intrinsic::x86_avx2_pmins_b:
10555  case Intrinsic::x86_avx2_pmins_w:
10556  case Intrinsic::x86_avx2_pmins_d: {
10557    unsigned Opcode;
10558    switch (IntNo) {
10559    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10560    case Intrinsic::x86_sse2_pmaxu_b:
10561    case Intrinsic::x86_sse41_pmaxuw:
10562    case Intrinsic::x86_sse41_pmaxud:
10563    case Intrinsic::x86_avx2_pmaxu_b:
10564    case Intrinsic::x86_avx2_pmaxu_w:
10565    case Intrinsic::x86_avx2_pmaxu_d:
10566      Opcode = X86ISD::UMAX;
10567      break;
10568    case Intrinsic::x86_sse2_pminu_b:
10569    case Intrinsic::x86_sse41_pminuw:
10570    case Intrinsic::x86_sse41_pminud:
10571    case Intrinsic::x86_avx2_pminu_b:
10572    case Intrinsic::x86_avx2_pminu_w:
10573    case Intrinsic::x86_avx2_pminu_d:
10574      Opcode = X86ISD::UMIN;
10575      break;
10576    case Intrinsic::x86_sse41_pmaxsb:
10577    case Intrinsic::x86_sse2_pmaxs_w:
10578    case Intrinsic::x86_sse41_pmaxsd:
10579    case Intrinsic::x86_avx2_pmaxs_b:
10580    case Intrinsic::x86_avx2_pmaxs_w:
10581    case Intrinsic::x86_avx2_pmaxs_d:
10582      Opcode = X86ISD::SMAX;
10583      break;
10584    case Intrinsic::x86_sse41_pminsb:
10585    case Intrinsic::x86_sse2_pmins_w:
10586    case Intrinsic::x86_sse41_pminsd:
10587    case Intrinsic::x86_avx2_pmins_b:
10588    case Intrinsic::x86_avx2_pmins_w:
10589    case Intrinsic::x86_avx2_pmins_d:
10590      Opcode = X86ISD::SMIN;
10591      break;
10592    }
10593    return DAG.getNode(Opcode, dl, Op.getValueType(),
10594                       Op.getOperand(1), Op.getOperand(2));
10595  }
10596
10597  // SSE/SSE2/AVX floating point max/min intrinsics.
10598  case Intrinsic::x86_sse_max_ps:
10599  case Intrinsic::x86_sse2_max_pd:
10600  case Intrinsic::x86_avx_max_ps_256:
10601  case Intrinsic::x86_avx_max_pd_256:
10602  case Intrinsic::x86_sse_min_ps:
10603  case Intrinsic::x86_sse2_min_pd:
10604  case Intrinsic::x86_avx_min_ps_256:
10605  case Intrinsic::x86_avx_min_pd_256: {
10606    unsigned Opcode;
10607    switch (IntNo) {
10608    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10609    case Intrinsic::x86_sse_max_ps:
10610    case Intrinsic::x86_sse2_max_pd:
10611    case Intrinsic::x86_avx_max_ps_256:
10612    case Intrinsic::x86_avx_max_pd_256:
10613      Opcode = X86ISD::FMAX;
10614      break;
10615    case Intrinsic::x86_sse_min_ps:
10616    case Intrinsic::x86_sse2_min_pd:
10617    case Intrinsic::x86_avx_min_ps_256:
10618    case Intrinsic::x86_avx_min_pd_256:
10619      Opcode = X86ISD::FMIN;
10620      break;
10621    }
10622    return DAG.getNode(Opcode, dl, Op.getValueType(),
10623                       Op.getOperand(1), Op.getOperand(2));
10624  }
10625
10626  // AVX2 variable shift intrinsics
10627  case Intrinsic::x86_avx2_psllv_d:
10628  case Intrinsic::x86_avx2_psllv_q:
10629  case Intrinsic::x86_avx2_psllv_d_256:
10630  case Intrinsic::x86_avx2_psllv_q_256:
10631  case Intrinsic::x86_avx2_psrlv_d:
10632  case Intrinsic::x86_avx2_psrlv_q:
10633  case Intrinsic::x86_avx2_psrlv_d_256:
10634  case Intrinsic::x86_avx2_psrlv_q_256:
10635  case Intrinsic::x86_avx2_psrav_d:
10636  case Intrinsic::x86_avx2_psrav_d_256: {
10637    unsigned Opcode;
10638    switch (IntNo) {
10639    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10640    case Intrinsic::x86_avx2_psllv_d:
10641    case Intrinsic::x86_avx2_psllv_q:
10642    case Intrinsic::x86_avx2_psllv_d_256:
10643    case Intrinsic::x86_avx2_psllv_q_256:
10644      Opcode = ISD::SHL;
10645      break;
10646    case Intrinsic::x86_avx2_psrlv_d:
10647    case Intrinsic::x86_avx2_psrlv_q:
10648    case Intrinsic::x86_avx2_psrlv_d_256:
10649    case Intrinsic::x86_avx2_psrlv_q_256:
10650      Opcode = ISD::SRL;
10651      break;
10652    case Intrinsic::x86_avx2_psrav_d:
10653    case Intrinsic::x86_avx2_psrav_d_256:
10654      Opcode = ISD::SRA;
10655      break;
10656    }
10657    return DAG.getNode(Opcode, dl, Op.getValueType(),
10658                       Op.getOperand(1), Op.getOperand(2));
10659  }
10660
10661  case Intrinsic::x86_ssse3_pshuf_b_128:
10662  case Intrinsic::x86_avx2_pshuf_b:
10663    return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
10664                       Op.getOperand(1), Op.getOperand(2));
10665
10666  case Intrinsic::x86_ssse3_psign_b_128:
10667  case Intrinsic::x86_ssse3_psign_w_128:
10668  case Intrinsic::x86_ssse3_psign_d_128:
10669  case Intrinsic::x86_avx2_psign_b:
10670  case Intrinsic::x86_avx2_psign_w:
10671  case Intrinsic::x86_avx2_psign_d:
10672    return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
10673                       Op.getOperand(1), Op.getOperand(2));
10674
10675  case Intrinsic::x86_sse41_insertps:
10676    return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
10677                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
10678
10679  case Intrinsic::x86_avx_vperm2f128_ps_256:
10680  case Intrinsic::x86_avx_vperm2f128_pd_256:
10681  case Intrinsic::x86_avx_vperm2f128_si_256:
10682  case Intrinsic::x86_avx2_vperm2i128:
10683    return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
10684                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
10685
10686  case Intrinsic::x86_avx2_permd:
10687  case Intrinsic::x86_avx2_permps:
10688    // Operands intentionally swapped. Mask is last operand to intrinsic,
10689    // but second operand for node/instruction.
10690    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
10691                       Op.getOperand(2), Op.getOperand(1));
10692
10693  case Intrinsic::x86_sse_sqrt_ps:
10694  case Intrinsic::x86_sse2_sqrt_pd:
10695  case Intrinsic::x86_avx_sqrt_ps_256:
10696  case Intrinsic::x86_avx_sqrt_pd_256:
10697    return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1));
10698
10699  // ptest and testp intrinsics. These intrinsics are designed to return an
10700  // integer value rather than just set flags, so lower them to the ptest or
10701  // testp pattern and a setcc on the result.
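  // PTEST tests every bit of its two operands, while the packed TESTPS/TESTPD
  // forms test only the per-element sign bits; both set ZF and CF, and the
  // SETCC emitted below converts the chosen flag into the i32 result.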
10702  case Intrinsic::x86_sse41_ptestz:
10703  case Intrinsic::x86_sse41_ptestc:
10704  case Intrinsic::x86_sse41_ptestnzc:
10705  case Intrinsic::x86_avx_ptestz_256:
10706  case Intrinsic::x86_avx_ptestc_256:
10707  case Intrinsic::x86_avx_ptestnzc_256:
10708  case Intrinsic::x86_avx_vtestz_ps:
10709  case Intrinsic::x86_avx_vtestc_ps:
10710  case Intrinsic::x86_avx_vtestnzc_ps:
10711  case Intrinsic::x86_avx_vtestz_pd:
10712  case Intrinsic::x86_avx_vtestc_pd:
10713  case Intrinsic::x86_avx_vtestnzc_pd:
10714  case Intrinsic::x86_avx_vtestz_ps_256:
10715  case Intrinsic::x86_avx_vtestc_ps_256:
10716  case Intrinsic::x86_avx_vtestnzc_ps_256:
10717  case Intrinsic::x86_avx_vtestz_pd_256:
10718  case Intrinsic::x86_avx_vtestc_pd_256:
10719  case Intrinsic::x86_avx_vtestnzc_pd_256: {
10720    bool IsTestPacked = false;
10721    unsigned X86CC;
10722    switch (IntNo) {
10723    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
10724    case Intrinsic::x86_avx_vtestz_ps:
10725    case Intrinsic::x86_avx_vtestz_pd:
10726    case Intrinsic::x86_avx_vtestz_ps_256:
10727    case Intrinsic::x86_avx_vtestz_pd_256:
10728      IsTestPacked = true; // Fallthrough
10729    case Intrinsic::x86_sse41_ptestz:
10730    case Intrinsic::x86_avx_ptestz_256:
10731      // ZF = 1
10732      X86CC = X86::COND_E;
10733      break;
10734    case Intrinsic::x86_avx_vtestc_ps:
10735    case Intrinsic::x86_avx_vtestc_pd:
10736    case Intrinsic::x86_avx_vtestc_ps_256:
10737    case Intrinsic::x86_avx_vtestc_pd_256:
10738      IsTestPacked = true; // Fallthrough
10739    case Intrinsic::x86_sse41_ptestc:
10740    case Intrinsic::x86_avx_ptestc_256:
10741      // CF = 1
10742      X86CC = X86::COND_B;
10743      break;
10744    case Intrinsic::x86_avx_vtestnzc_ps:
10745    case Intrinsic::x86_avx_vtestnzc_pd:
10746    case Intrinsic::x86_avx_vtestnzc_ps_256:
10747    case Intrinsic::x86_avx_vtestnzc_pd_256:
10748      IsTestPacked = true; // Fallthrough
10749    case Intrinsic::x86_sse41_ptestnzc:
10750    case Intrinsic::x86_avx_ptestnzc_256:
10751      // ZF and CF = 0
10752      X86CC = X86::COND_A;
10753      break;
10754    }
10755
10756    SDValue LHS = Op.getOperand(1);
10757    SDValue RHS = Op.getOperand(2);
10758    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
10759    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
10760    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
10761    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
10762    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
10763  }
10764
10765  // SSE/AVX shift intrinsics
10766  case Intrinsic::x86_sse2_psll_w:
10767  case Intrinsic::x86_sse2_psll_d:
10768  case Intrinsic::x86_sse2_psll_q:
10769  case Intrinsic::x86_avx2_psll_w:
10770  case Intrinsic::x86_avx2_psll_d:
10771  case Intrinsic::x86_avx2_psll_q:
10772  case Intrinsic::x86_sse2_psrl_w:
10773  case Intrinsic::x86_sse2_psrl_d:
10774  case Intrinsic::x86_sse2_psrl_q:
10775  case Intrinsic::x86_avx2_psrl_w:
10776  case Intrinsic::x86_avx2_psrl_d:
10777  case Intrinsic::x86_avx2_psrl_q:
10778  case Intrinsic::x86_sse2_psra_w:
10779  case Intrinsic::x86_sse2_psra_d:
10780  case Intrinsic::x86_avx2_psra_w:
10781  case Intrinsic::x86_avx2_psra_d: {
10782    unsigned Opcode;
10783    switch (IntNo) {
10784    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10785    case Intrinsic::x86_sse2_psll_w:
10786    case Intrinsic::x86_sse2_psll_d:
10787    case Intrinsic::x86_sse2_psll_q:
10788    case Intrinsic::x86_avx2_psll_w:
10789    case Intrinsic::x86_avx2_psll_d:
10790    case Intrinsic::x86_avx2_psll_q:
10791      Opcode = X86ISD::VSHL;
10792      break;
10793    case Intrinsic::x86_sse2_psrl_w:
10794    case Intrinsic::x86_sse2_psrl_d:
10795    case Intrinsic::x86_sse2_psrl_q:
10796    case Intrinsic::x86_avx2_psrl_w:
10797    case Intrinsic::x86_avx2_psrl_d:
10798    case Intrinsic::x86_avx2_psrl_q:
10799      Opcode = X86ISD::VSRL;
10800      break;
10801    case Intrinsic::x86_sse2_psra_w:
10802    case Intrinsic::x86_sse2_psra_d:
10803    case Intrinsic::x86_avx2_psra_w:
10804    case Intrinsic::x86_avx2_psra_d:
10805      Opcode = X86ISD::VSRA;
10806      break;
10807    }
10808    return DAG.getNode(Opcode, dl, Op.getValueType(),
10809                       Op.getOperand(1), Op.getOperand(2));
10810  }
10811
10812  // SSE/AVX immediate shift intrinsics
10813  case Intrinsic::x86_sse2_pslli_w:
10814  case Intrinsic::x86_sse2_pslli_d:
10815  case Intrinsic::x86_sse2_pslli_q:
10816  case Intrinsic::x86_avx2_pslli_w:
10817  case Intrinsic::x86_avx2_pslli_d:
10818  case Intrinsic::x86_avx2_pslli_q:
10819  case Intrinsic::x86_sse2_psrli_w:
10820  case Intrinsic::x86_sse2_psrli_d:
10821  case Intrinsic::x86_sse2_psrli_q:
10822  case Intrinsic::x86_avx2_psrli_w:
10823  case Intrinsic::x86_avx2_psrli_d:
10824  case Intrinsic::x86_avx2_psrli_q:
10825  case Intrinsic::x86_sse2_psrai_w:
10826  case Intrinsic::x86_sse2_psrai_d:
10827  case Intrinsic::x86_avx2_psrai_w:
10828  case Intrinsic::x86_avx2_psrai_d: {
10829    unsigned Opcode;
10830    switch (IntNo) {
10831    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10832    case Intrinsic::x86_sse2_pslli_w:
10833    case Intrinsic::x86_sse2_pslli_d:
10834    case Intrinsic::x86_sse2_pslli_q:
10835    case Intrinsic::x86_avx2_pslli_w:
10836    case Intrinsic::x86_avx2_pslli_d:
10837    case Intrinsic::x86_avx2_pslli_q:
10838      Opcode = X86ISD::VSHLI;
10839      break;
10840    case Intrinsic::x86_sse2_psrli_w:
10841    case Intrinsic::x86_sse2_psrli_d:
10842    case Intrinsic::x86_sse2_psrli_q:
10843    case Intrinsic::x86_avx2_psrli_w:
10844    case Intrinsic::x86_avx2_psrli_d:
10845    case Intrinsic::x86_avx2_psrli_q:
10846      Opcode = X86ISD::VSRLI;
10847      break;
10848    case Intrinsic::x86_sse2_psrai_w:
10849    case Intrinsic::x86_sse2_psrai_d:
10850    case Intrinsic::x86_avx2_psrai_w:
10851    case Intrinsic::x86_avx2_psrai_d:
10852      Opcode = X86ISD::VSRAI;
10853      break;
10854    }
10855    return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
10856                               Op.getOperand(1), Op.getOperand(2), DAG);
10857  }
10858
10859  case Intrinsic::x86_sse42_pcmpistria128:
10860  case Intrinsic::x86_sse42_pcmpestria128:
10861  case Intrinsic::x86_sse42_pcmpistric128:
10862  case Intrinsic::x86_sse42_pcmpestric128:
10863  case Intrinsic::x86_sse42_pcmpistrio128:
10864  case Intrinsic::x86_sse42_pcmpestrio128:
10865  case Intrinsic::x86_sse42_pcmpistris128:
10866  case Intrinsic::x86_sse42_pcmpestris128:
10867  case Intrinsic::x86_sse42_pcmpistriz128:
10868  case Intrinsic::x86_sse42_pcmpestriz128: {
10869    unsigned Opcode;
10870    unsigned X86CC;
10871    switch (IntNo) {
10872    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10873    case Intrinsic::x86_sse42_pcmpistria128:
10874      Opcode = X86ISD::PCMPISTRI;
10875      X86CC = X86::COND_A;
10876      break;
10877    case Intrinsic::x86_sse42_pcmpestria128:
10878      Opcode = X86ISD::PCMPESTRI;
10879      X86CC = X86::COND_A;
10880      break;
10881    case Intrinsic::x86_sse42_pcmpistric128:
10882      Opcode = X86ISD::PCMPISTRI;
10883      X86CC = X86::COND_B;
10884      break;
10885    case Intrinsic::x86_sse42_pcmpestric128:
10886      Opcode = X86ISD::PCMPESTRI;
10887      X86CC = X86::COND_B;
10888      break;
10889    case Intrinsic::x86_sse42_pcmpistrio128:
10890      Opcode = X86ISD::PCMPISTRI;
10891      X86CC = X86::COND_O;
10892      break;
10893    case Intrinsic::x86_sse42_pcmpestrio128:
10894      Opcode = X86ISD::PCMPESTRI;
10895      X86CC = X86::COND_O;
10896      break;
10897    case Intrinsic::x86_sse42_pcmpistris128:
10898      Opcode = X86ISD::PCMPISTRI;
10899      X86CC = X86::COND_S;
10900      break;
10901    case Intrinsic::x86_sse42_pcmpestris128:
10902      Opcode = X86ISD::PCMPESTRI;
10903      X86CC = X86::COND_S;
10904      break;
10905    case Intrinsic::x86_sse42_pcmpistriz128:
10906      Opcode = X86ISD::PCMPISTRI;
10907      X86CC = X86::COND_E;
10908      break;
10909    case Intrinsic::x86_sse42_pcmpestriz128:
10910      Opcode = X86ISD::PCMPESTRI;
10911      X86CC = X86::COND_E;
10912      break;
10913    }
10914    SmallVector<SDValue, 5> NewOps;
10915    NewOps.append(Op->op_begin()+1, Op->op_end());
10916    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
10917    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
10918    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
10919                                DAG.getConstant(X86CC, MVT::i8),
10920                                SDValue(PCMP.getNode(), 1));
10921    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
10922  }
10923
10924  case Intrinsic::x86_sse42_pcmpistri128:
10925  case Intrinsic::x86_sse42_pcmpestri128: {
10926    unsigned Opcode;
10927    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
10928      Opcode = X86ISD::PCMPISTRI;
10929    else
10930      Opcode = X86ISD::PCMPESTRI;
10931
10932    SmallVector<SDValue, 5> NewOps;
10933    NewOps.append(Op->op_begin()+1, Op->op_end());
10934    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
10935    return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
10936  }
10937  case Intrinsic::x86_fma_vfmadd_ps:
10938  case Intrinsic::x86_fma_vfmadd_pd:
10939  case Intrinsic::x86_fma_vfmsub_ps:
10940  case Intrinsic::x86_fma_vfmsub_pd:
10941  case Intrinsic::x86_fma_vfnmadd_ps:
10942  case Intrinsic::x86_fma_vfnmadd_pd:
10943  case Intrinsic::x86_fma_vfnmsub_ps:
10944  case Intrinsic::x86_fma_vfnmsub_pd:
10945  case Intrinsic::x86_fma_vfmaddsub_ps:
10946  case Intrinsic::x86_fma_vfmaddsub_pd:
10947  case Intrinsic::x86_fma_vfmsubadd_ps:
10948  case Intrinsic::x86_fma_vfmsubadd_pd:
10949  case Intrinsic::x86_fma_vfmadd_ps_256:
10950  case Intrinsic::x86_fma_vfmadd_pd_256:
10951  case Intrinsic::x86_fma_vfmsub_ps_256:
10952  case Intrinsic::x86_fma_vfmsub_pd_256:
10953  case Intrinsic::x86_fma_vfnmadd_ps_256:
10954  case Intrinsic::x86_fma_vfnmadd_pd_256:
10955  case Intrinsic::x86_fma_vfnmsub_ps_256:
10956  case Intrinsic::x86_fma_vfnmsub_pd_256:
10957  case Intrinsic::x86_fma_vfmaddsub_ps_256:
10958  case Intrinsic::x86_fma_vfmaddsub_pd_256:
10959  case Intrinsic::x86_fma_vfmsubadd_ps_256:
10960  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
10961    unsigned Opc;
10962    switch (IntNo) {
10963    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10964    case Intrinsic::x86_fma_vfmadd_ps:
10965    case Intrinsic::x86_fma_vfmadd_pd:
10966    case Intrinsic::x86_fma_vfmadd_ps_256:
10967    case Intrinsic::x86_fma_vfmadd_pd_256:
10968      Opc = X86ISD::FMADD;
10969      break;
10970    case Intrinsic::x86_fma_vfmsub_ps:
10971    case Intrinsic::x86_fma_vfmsub_pd:
10972    case Intrinsic::x86_fma_vfmsub_ps_256:
10973    case Intrinsic::x86_fma_vfmsub_pd_256:
10974      Opc = X86ISD::FMSUB;
10975      break;
10976    case Intrinsic::x86_fma_vfnmadd_ps:
10977    case Intrinsic::x86_fma_vfnmadd_pd:
10978    case Intrinsic::x86_fma_vfnmadd_ps_256:
10979    case Intrinsic::x86_fma_vfnmadd_pd_256:
10980      Opc = X86ISD::FNMADD;
10981      break;
10982    case Intrinsic::x86_fma_vfnmsub_ps:
10983    case Intrinsic::x86_fma_vfnmsub_pd:
10984    case Intrinsic::x86_fma_vfnmsub_ps_256:
10985    case Intrinsic::x86_fma_vfnmsub_pd_256:
10986      Opc = X86ISD::FNMSUB;
10987      break;
10988    case Intrinsic::x86_fma_vfmaddsub_ps:
10989    case Intrinsic::x86_fma_vfmaddsub_pd:
10990    case Intrinsic::x86_fma_vfmaddsub_ps_256:
10991    case Intrinsic::x86_fma_vfmaddsub_pd_256:
10992      Opc = X86ISD::FMADDSUB;
10993      break;
10994    case Intrinsic::x86_fma_vfmsubadd_ps:
10995    case Intrinsic::x86_fma_vfmsubadd_pd:
10996    case Intrinsic::x86_fma_vfmsubadd_ps_256:
10997    case Intrinsic::x86_fma_vfmsubadd_pd_256:
10998      Opc = X86ISD::FMSUBADD;
10999      break;
11000    }
11001
11002    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
11003                       Op.getOperand(2), Op.getOperand(3));
11004  }
11005  }
11006}
11007
11008static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
11009  SDLoc dl(Op);
11010  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
11011  switch (IntNo) {
11012  default: return SDValue();    // Don't custom lower most intrinsics.
11013
11014  // RDRAND/RDSEED intrinsics.
11015  case Intrinsic::x86_rdrand_16:
11016  case Intrinsic::x86_rdrand_32:
11017  case Intrinsic::x86_rdrand_64:
11018  case Intrinsic::x86_rdseed_16:
11019  case Intrinsic::x86_rdseed_32:
11020  case Intrinsic::x86_rdseed_64: {
11021    unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
11022                       IntNo == Intrinsic::x86_rdseed_32 ||
11023                       IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
11024                                                            X86ISD::RDRAND;
11025    // Emit the node with the right value type.
11026    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
11027    SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
11028
11029    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
11030    // Otherwise return the value from Rand, which is always 0, cast to i32.
11031    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
11032                      DAG.getConstant(1, Op->getValueType(1)),
11033                      DAG.getConstant(X86::COND_B, MVT::i32),
11034                      SDValue(Result.getNode(), 1) };
11035    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
11036                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
11037                                  Ops, array_lengthof(Ops));
11038
11039    // Return { result, isValid, chain }.
11040    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
11041                       SDValue(Result.getNode(), 2));
11042  }
11043
11044  // XTEST intrinsics.
11045  case Intrinsic::x86_xtest: {
11046    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
11047    SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
11048    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
11049                                DAG.getConstant(X86::COND_NE, MVT::i8),
11050                                InTrans);
11051    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
11052    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
11053                       Ret, SDValue(InTrans.getNode(), 1));
11054  }
11055  }
11056}
11057
11058SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
11059                                           SelectionDAG &DAG) const {
11060  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
11061  MFI->setReturnAddressIsTaken(true);
11062
11063  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
11064  SDLoc dl(Op);
11065  EVT PtrVT = getPointerTy();
11066
11067  if (Depth > 0) {
11068    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
11069    const X86RegisterInfo *RegInfo =
11070      static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
11071    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
11072    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
11073                       DAG.getNode(ISD::ADD, dl, PtrVT,
11074                                   FrameAddr, Offset),
11075                       MachinePointerInfo(), false, false, false, 0);
11076  }
11077
11078  // Just load the return address.
11079  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
11080  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
11081                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
11082}
11083
11084SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
11085  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
11086  MFI->setFrameAddressIsTaken(true);
11087
11088  EVT VT = Op.getValueType();
11089  SDLoc dl(Op);  // FIXME probably not meaningful
11090  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
11091  const X86RegisterInfo *RegInfo =
11092    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
11093  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
11094  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
11095          (FrameReg == X86::EBP && VT == MVT::i32)) &&
11096         "Invalid Frame Register!");
11097  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
11098  while (Depth--)
11099    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
11100                            MachinePointerInfo(),
11101                            false, false, false, 0);
11102  return FrameAddr;
11103}
11104
11105SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
11106                                                     SelectionDAG &DAG) const {
11107  const X86RegisterInfo *RegInfo =
11108    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
11109  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
11110}
11111
11112SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
11113  SDValue Chain     = Op.getOperand(0);
11114  SDValue Offset    = Op.getOperand(1);
11115  SDValue Handler   = Op.getOperand(2);
11116  SDLoc dl      (Op);
11117
11118  EVT PtrVT = getPointerTy();
11119  const X86RegisterInfo *RegInfo =
11120    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
11121  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
11122  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
11123          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
11124         "Invalid Frame Register!");
11125  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
11126  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
11127
11128  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
11129                                 DAG.getIntPtrConstant(RegInfo->getSlotSize()));
11130  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
11131  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
11132                       false, false, 0);
11133  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
11134
11135  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
11136                     DAG.getRegister(StoreAddrReg, PtrVT));
11137}
11138
11139SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
11140                                               SelectionDAG &DAG) const {
11141  SDLoc DL(Op);
11142  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
11143                     DAG.getVTList(MVT::i32, MVT::Other),
11144                     Op.getOperand(0), Op.getOperand(1));
11145}
11146
11147SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
11148                                                SelectionDAG &DAG) const {
11149  SDLoc DL(Op);
11150  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
11151                     Op.getOperand(0), Op.getOperand(1));
11152}
11153
11154static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
11155  return Op.getOperand(0);
11156}
11157
11158SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
11159                                                SelectionDAG &DAG) const {
11160  SDValue Root = Op.getOperand(0);
11161  SDValue Trmp = Op.getOperand(1); // trampoline
11162  SDValue FPtr = Op.getOperand(2); // nested function
11163  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
11164  SDLoc dl (Op);
11165
11166  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
11167  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
11168
11169  if (Subtarget->is64Bit()) {
11170    SDValue OutChains[6];
11171
11172    // Large code-model.
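    // The bytes stored below form the trampoline:
    //   movabsq $<nested function>, %r11
    //   movabsq $<nest value>,      %r10
    //   jmpq    *%r11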
11173    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
11174    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
11175
11176    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
11177    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
11178
11179    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
11180
11181    // Load the pointer to the nested function into R11.
11182    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
11183    SDValue Addr = Trmp;
11184    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
11185                                Addr, MachinePointerInfo(TrmpAddr),
11186                                false, false, 0);
11187
11188    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
11189                       DAG.getConstant(2, MVT::i64));
11190    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
11191                                MachinePointerInfo(TrmpAddr, 2),
11192                                false, false, 2);
11193
11194    // Load the 'nest' parameter value into R10.
11195    // R10 is specified in X86CallingConv.td
11196    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
11197    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
11198                       DAG.getConstant(10, MVT::i64));
11199    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
11200                                Addr, MachinePointerInfo(TrmpAddr, 10),
11201                                false, false, 0);
11202
11203    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
11204                       DAG.getConstant(12, MVT::i64));
11205    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
11206                                MachinePointerInfo(TrmpAddr, 12),
11207                                false, false, 2);
11208
11209    // Jump to the nested function.
11210    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
11211    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
11212                       DAG.getConstant(20, MVT::i64));
11213    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
11214                                Addr, MachinePointerInfo(TrmpAddr, 20),
11215                                false, false, 0);
11216
11217    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
11218    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
11219                       DAG.getConstant(22, MVT::i64));
11220    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
11221                                MachinePointerInfo(TrmpAddr, 22),
11222                                false, false, 0);
11223
11224    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
11225  } else {
11226    const Function *Func =
11227      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
11228    CallingConv::ID CC = Func->getCallingConv();
11229    unsigned NestReg;
11230
11231    switch (CC) {
11232    default:
11233      llvm_unreachable("Unsupported calling convention");
11234    case CallingConv::C:
11235    case CallingConv::X86_StdCall: {
11236      // Pass 'nest' parameter in ECX.
11237      // Must be kept in sync with X86CallingConv.td
11238      NestReg = X86::ECX;
11239
11240      // Check that ECX wasn't needed by an 'inreg' parameter.
11241      FunctionType *FTy = Func->getFunctionType();
11242      const AttributeSet &Attrs = Func->getAttributes();
11243
11244      if (!Attrs.isEmpty() && !Func->isVarArg()) {
11245        unsigned InRegCount = 0;
11246        unsigned Idx = 1;
11247
11248        for (FunctionType::param_iterator I = FTy->param_begin(),
11249             E = FTy->param_end(); I != E; ++I, ++Idx)
11250          if (Attrs.hasAttribute(Idx, Attribute::InReg))
11251            // FIXME: should only count parameters that are lowered to integers.
11252            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
11253
11254        if (InRegCount > 2) {
11255          report_fatal_error("Nest register in use - reduce number of inreg"
11256                             " parameters!");
11257        }
11258      }
11259      break;
11260    }
11261    case CallingConv::X86_FastCall:
11262    case CallingConv::X86_ThisCall:
11263    case CallingConv::Fast:
11264      // Pass 'nest' parameter in EAX.
11265      // Must be kept in sync with X86CallingConv.td
11266      NestReg = X86::EAX;
11267      break;
11268    }
11269
11270    SDValue OutChains[4];
11271    SDValue Addr, Disp;
11272
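    // The bytes stored below form the trampoline:
    //   movl $<nest value>, %ecx (or %eax, depending on NestReg)
    //   jmp  <nested function>   (a rel32 displacement, computed as Disp)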
11273    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
11274                       DAG.getConstant(10, MVT::i32));
11275    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
11276
11277    // This is storing the opcode for MOV32ri.
11278    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
11279    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
11280    OutChains[0] = DAG.getStore(Root, dl,
11281                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
11282                                Trmp, MachinePointerInfo(TrmpAddr),
11283                                false, false, 0);
11284
11285    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
11286                       DAG.getConstant(1, MVT::i32));
11287    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
11288                                MachinePointerInfo(TrmpAddr, 1),
11289                                false, false, 1);
11290
11291    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
11292    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
11293                       DAG.getConstant(5, MVT::i32));
11294    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
11295                                MachinePointerInfo(TrmpAddr, 5),
11296                                false, false, 1);
11297
11298    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
11299                       DAG.getConstant(6, MVT::i32));
11300    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
11301                                MachinePointerInfo(TrmpAddr, 6),
11302                                false, false, 1);
11303
11304    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
11305  }
11306}
11307
11308SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
11309                                            SelectionDAG &DAG) const {
11310  /*
11311   The rounding mode is in bits 11:10 of the FP control word, and has the following
11312   settings:
11313     00 Round to nearest
11314     01 Round to -inf
11315     10 Round to +inf
11316     11 Round to 0
11317
11318  FLT_ROUNDS, on the other hand, expects the following:
11319    -1 Undefined
11320     0 Round to 0
11321     1 Round to nearest
11322     2 Round to +inf
11323     3 Round to -inf
11324
11325  To perform the conversion, we do:
11326    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
11327  */
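  // For example, with rounding-control bits 11:10 = 01 (round toward -inf):
  // bit 11 is 0 and bit 10 is 1, so ((0 >> 11) | (0x400 >> 9)) + 1 = 3, which
  // is exactly FLT_ROUNDS' "round to -inf" encoding.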
11328
11329  MachineFunction &MF = DAG.getMachineFunction();
11330  const TargetMachine &TM = MF.getTarget();
11331  const TargetFrameLowering &TFI = *TM.getFrameLowering();
11332  unsigned StackAlignment = TFI.getStackAlignment();
11333  EVT VT = Op.getValueType();
11334  SDLoc DL(Op);
11335
11336  // Save FP Control Word to stack slot
11337  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
11338  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
11339
11340  MachineMemOperand *MMO =
11341   MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
11342                           MachineMemOperand::MOStore, 2, 2);
11343
11344  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
11345  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
11346                                          DAG.getVTList(MVT::Other),
11347                                          Ops, array_lengthof(Ops), MVT::i16,
11348                                          MMO);
11349
11350  // Load FP Control Word from stack slot
11351  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
11352                            MachinePointerInfo(), false, false, false, 0);
11353
11354  // Transform the rounding-control bits into the FLT_ROUNDS encoding.
11355  SDValue CWD1 =
11356    DAG.getNode(ISD::SRL, DL, MVT::i16,
11357                DAG.getNode(ISD::AND, DL, MVT::i16,
11358                            CWD, DAG.getConstant(0x800, MVT::i16)),
11359                DAG.getConstant(11, MVT::i8));
11360  SDValue CWD2 =
11361    DAG.getNode(ISD::SRL, DL, MVT::i16,
11362                DAG.getNode(ISD::AND, DL, MVT::i16,
11363                            CWD, DAG.getConstant(0x400, MVT::i16)),
11364                DAG.getConstant(9, MVT::i8));
11365
11366  SDValue RetVal =
11367    DAG.getNode(ISD::AND, DL, MVT::i16,
11368                DAG.getNode(ISD::ADD, DL, MVT::i16,
11369                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
11370                            DAG.getConstant(1, MVT::i16)),
11371                DAG.getConstant(3, MVT::i16));
11372
11373  return DAG.getNode((VT.getSizeInBits() < 16 ?
11374                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
11375}
11376
11377static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
11378  EVT VT = Op.getValueType();
11379  EVT OpVT = VT;
11380  unsigned NumBits = VT.getSizeInBits();
11381  SDLoc dl(Op);
11382
11383  Op = Op.getOperand(0);
11384  if (VT == MVT::i8) {
11385    // Zero extend to i32 since there is no i8 bsr.
11386    OpVT = MVT::i32;
11387    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
11388  }
11389
11390  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
11391  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
11392  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
11393
11394  // If src is zero (i.e. bsr sets ZF), returns NumBits.
11395  SDValue Ops[] = {
11396    Op,
11397    DAG.getConstant(NumBits+NumBits-1, OpVT),
11398    DAG.getConstant(X86::COND_E, MVT::i8),
11399    Op.getValue(1)
11400  };
11401  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
11402
11403  // Finally xor with NumBits-1.
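  // BSR returns the index of the highest set bit, which is at most NumBits-1
  // here, so (NumBits-1) ^ index == (NumBits-1) - index, i.e. the leading-zero
  // count.  The zero case was preloaded with 2*NumBits-1 above, which the xor
  // turns into NumBits.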
11404  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
11405
11406  if (VT == MVT::i8)
11407    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
11408  return Op;
11409}
11410
11411static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
11412  EVT VT = Op.getValueType();
11413  EVT OpVT = VT;
11414  unsigned NumBits = VT.getSizeInBits();
11415  SDLoc dl(Op);
11416
11417  Op = Op.getOperand(0);
11418  if (VT == MVT::i8) {
11419    // Zero extend to i32 since there is no i8 bsr.
11420    OpVT = MVT::i32;
11421    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
11422  }
11423
11424  // Issue a bsr (scan bits in reverse).
11425  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
11426  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
11427
11428  // And xor with NumBits-1.
11429  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
11430
11431  if (VT == MVT::i8)
11432    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
11433  return Op;
11434}
11435
11436static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
11437  EVT VT = Op.getValueType();
11438  unsigned NumBits = VT.getSizeInBits();
11439  SDLoc dl(Op);
11440  Op = Op.getOperand(0);
11441
11442  // Issue a bsf (scan bits forward) which also sets EFLAGS.
11443  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
11444  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
11445
11446  // If src is zero (i.e. bsf sets ZF), returns NumBits.
11447  SDValue Ops[] = {
11448    Op,
11449    DAG.getConstant(NumBits, VT),
11450    DAG.getConstant(X86::COND_E, MVT::i8),
11451    Op.getValue(1)
11452  };
11453  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
11454}
11455
11456// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
11457// ones, and then concatenate the result back.
11458static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
11459  EVT VT = Op.getValueType();
11460
11461  assert(VT.is256BitVector() && VT.isInteger() &&
11462         "Unsupported value type for operation");
11463
11464  unsigned NumElems = VT.getVectorNumElements();
11465  SDLoc dl(Op);
11466
11467  // Extract the LHS vectors
11468  SDValue LHS = Op.getOperand(0);
11469  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
11470  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
11471
11472  // Extract the RHS vectors
11473  SDValue RHS = Op.getOperand(1);
11474  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
11475  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
11476
11477  MVT EltVT = VT.getVectorElementType().getSimpleVT();
11478  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
11479
11480  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
11481                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
11482                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
11483}
11484
11485static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
11486  assert(Op.getValueType().is256BitVector() &&
11487         Op.getValueType().isInteger() &&
11488         "Only handle AVX 256-bit vector integer operation");
11489  return Lower256IntArith(Op, DAG);
11490}
11491
11492static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
11493  assert(Op.getValueType().is256BitVector() &&
11494         Op.getValueType().isInteger() &&
11495         "Only handle AVX 256-bit vector integer operation");
11496  return Lower256IntArith(Op, DAG);
11497}
11498
11499static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
11500                        SelectionDAG &DAG) {
11501  SDLoc dl(Op);
11502  EVT VT = Op.getValueType();
11503
11504  // Decompose 256-bit ops into smaller 128-bit ops.
11505  if (VT.is256BitVector() && !Subtarget->hasInt256())
11506    return Lower256IntArith(Op, DAG);
11507
11508  SDValue A = Op.getOperand(0);
11509  SDValue B = Op.getOperand(1);
11510
11511  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
11512  if (VT == MVT::v4i32) {
11513    assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
11514           "Should not custom lower when pmuldq is available!");
11515
11516    // Extract the odd parts.
11517    static const int UnpackMask[] = { 1, -1, 3, -1 };
11518    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
11519    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
11520
11521    // Multiply the even parts.
11522    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
11523    // Now multiply odd parts.
11524    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
11525
11526    Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
11527    Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
11528
11529    // Merge the two vectors back together with a shuffle. This expands into 2
11530    // shuffles.
11531    static const int ShufMask[] = { 0, 4, 2, 6 };
11532    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
11533  }
11534
11535  assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
11536         "Only know how to lower V2I64/V4I64 multiply");
11537
11538  //  Ahi = psrlqi(a, 32);
11539  //  Bhi = psrlqi(b, 32);
11540  //
11541  //  AloBlo = pmuludq(a, b);
11542  //  AloBhi = pmuludq(a, Bhi);
11543  //  AhiBlo = pmuludq(Ahi, b);
11544
11545  //  AloBhi = psllqi(AloBhi, 32);
11546  //  AhiBlo = psllqi(AhiBlo, 32);
11547  //  return AloBlo + AloBhi + AhiBlo;
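  //  This is the usual 64x64->64 decomposition: with A = Ahi*2^32 + Alo and
  //  B = Bhi*2^32 + Blo, the low 64 bits of A*B are
  //  Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32); the Ahi*Bhi term is shifted out.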
11548
11549  SDValue ShAmt = DAG.getConstant(32, MVT::i32);
11550
11551  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
11552  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
11553
11554  // Bit cast to 32-bit vectors for MULUDQ
11555  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
11556  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
11557  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
11558  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
11559  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
11560
11561  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
11562  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
11563  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
11564
11565  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
11566  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
11567
11568  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
11569  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
11570}
11571
11572SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
11573  EVT VT = Op.getValueType();
11574  EVT EltTy = VT.getVectorElementType();
11575  unsigned NumElts = VT.getVectorNumElements();
11576  SDValue N0 = Op.getOperand(0);
11577  SDLoc dl(Op);
11578
11579  // Lower sdiv X, pow2-const.
11580  BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
11581  if (!C)
11582    return SDValue();
11583
11584  APInt SplatValue, SplatUndef;
11585  unsigned SplatBitSize;
11586  bool HasAnyUndefs;
11587  if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
11588                          HasAnyUndefs) ||
11589      EltTy.getSizeInBits() < SplatBitSize)
11590    return SDValue();
11591
11592  if ((SplatValue != 0) &&
11593      (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
11594    unsigned lg2 = SplatValue.countTrailingZeros();
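    // Standard signed division by a power of two: for negative dividends, add
    // 2^lg2 - 1 before the arithmetic shift so the quotient rounds toward
    // zero, then negate the result if the divisor itself was negative.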
11595    // Splat the sign bit.
11596    SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32);
11597    SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG);
11598    // Add (N0 < 0) ? 2^lg2 - 1 : 0;
11599    SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32);
11600    SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG);
11601    SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
11602    SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32);
11603    SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG);
11604
11605    // If we're dividing by a positive value, we're done.  Otherwise, we must
11606    // negate the result.
11607    if (SplatValue.isNonNegative())
11608      return SRA;
11609
11610    SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy));
11611    SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts);
11612    return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA);
11613  }
11614  return SDValue();
11615}
11616
11617static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
11618                                         const X86Subtarget *Subtarget) {
11619  EVT VT = Op.getValueType();
11620  SDLoc dl(Op);
11621  SDValue R = Op.getOperand(0);
11622  SDValue Amt = Op.getOperand(1);
11623
11624  // Optimize shl/srl/sra with constant shift amount.
11625  if (isSplatVector(Amt.getNode())) {
11626    SDValue SclrAmt = Amt->getOperand(0);
11627    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
11628      uint64_t ShiftAmt = C->getZExtValue();
11629
11630      if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
11631          (Subtarget->hasInt256() &&
11632           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
11633        if (Op.getOpcode() == ISD::SHL)
11634          return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
11635                             DAG.getConstant(ShiftAmt, MVT::i32));
11636        if (Op.getOpcode() == ISD::SRL)
11637          return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
11638                             DAG.getConstant(ShiftAmt, MVT::i32));
11639        if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
11640          return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
11641                             DAG.getConstant(ShiftAmt, MVT::i32));
11642      }
11643
11644      if (VT == MVT::v16i8) {
11645        if (Op.getOpcode() == ISD::SHL) {
11646          // Make a large shift.
11647          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
11648                                    DAG.getConstant(ShiftAmt, MVT::i32));
11649          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
11650          // Zero out the rightmost bits.
11651          SmallVector<SDValue, 16> V(16,
11652                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
11653                                                     MVT::i8));
11654          return DAG.getNode(ISD::AND, dl, VT, SHL,
11655                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
11656        }
11657        if (Op.getOpcode() == ISD::SRL) {
11658          // Make a large shift.
11659          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
11660                                    DAG.getConstant(ShiftAmt, MVT::i32));
11661          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
11662          // Zero out the leftmost bits.
11663          SmallVector<SDValue, 16> V(16,
11664                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
11665                                                     MVT::i8));
11666          return DAG.getNode(ISD::AND, dl, VT, SRL,
11667                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
11668        }
11669        if (Op.getOpcode() == ISD::SRA) {
11670          if (ShiftAmt == 7) {
11671            // R s>> 7  ===  R s< 0
11672            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
11673            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
11674          }
11675
11676          // R s>> a === ((R u>> a) ^ m) - m
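          // Here m = 0x80 >> a is the mask holding the shifted-down sign bit;
          // xor'ing with it and then subtracting it sign-extends the result of
          // the logical shift.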
11677          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
11678          SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
11679                                                         MVT::i8));
11680          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
11681          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
11682          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
11683          return Res;
11684        }
11685        llvm_unreachable("Unknown shift opcode.");
11686      }
11687
11688      if (Subtarget->hasInt256() && VT == MVT::v32i8) {
11689        if (Op.getOpcode() == ISD::SHL) {
11690          // Make a large shift.
11691          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
11692                                    DAG.getConstant(ShiftAmt, MVT::i32));
11693          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
11694          // Zero out the rightmost bits.
11695          SmallVector<SDValue, 32> V(32,
11696                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
11697                                                     MVT::i8));
11698          return DAG.getNode(ISD::AND, dl, VT, SHL,
11699                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
11700        }
11701        if (Op.getOpcode() == ISD::SRL) {
11702          // Make a large shift.
11703          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
11704                                    DAG.getConstant(ShiftAmt, MVT::i32));
11705          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
11706          // Zero out the leftmost bits.
11707          SmallVector<SDValue, 32> V(32,
11708                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
11709                                                     MVT::i8));
11710          return DAG.getNode(ISD::AND, dl, VT, SRL,
11711                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
11712        }
11713        if (Op.getOpcode() == ISD::SRA) {
11714          if (ShiftAmt == 7) {
11715            // R s>> 7  ===  R s< 0
11716            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
11717            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
11718          }
11719
11720          // R s>> a === ((R u>> a) ^ m) - m
11721          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
11722          SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
11723                                                         MVT::i8));
11724          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
11725          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
11726          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
11727          return Res;
11728        }
11729        llvm_unreachable("Unknown shift opcode.");
11730      }
11731    }
11732  }
11733
11734  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
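  // A splatted v2i64/v4i64 shift amount reaches us here as a bitcast of a
  // wider BUILD_VECTOR of i32s, so reassemble each 64-bit amount from its
  // 32-bit halves and check that every lane encodes the same amount.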
11735  if (!Subtarget->is64Bit() &&
11736      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
11737      Amt.getOpcode() == ISD::BITCAST &&
11738      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
11739    Amt = Amt.getOperand(0);
11740    unsigned Ratio = Amt.getValueType().getVectorNumElements() /
11741                     VT.getVectorNumElements();
11742    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
11743    uint64_t ShiftAmt = 0;
11744    for (unsigned i = 0; i != Ratio; ++i) {
11745      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
11746      if (C == 0)
11747        return SDValue();
11748      // 6 == Log2(64)
11749      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
11750    }
11751    // Check remaining shift amounts.
11752    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
11753      uint64_t ShAmt = 0;
11754      for (unsigned j = 0; j != Ratio; ++j) {
11755        ConstantSDNode *C =
11756          dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
11757        if (C == 0)
11758          return SDValue();
11759        // 6 == Log2(64)
11760        ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
11761      }
11762      if (ShAmt != ShiftAmt)
11763        return SDValue();
11764    }
11765    switch (Op.getOpcode()) {
11766    default:
11767      llvm_unreachable("Unknown shift opcode!");
11768    case ISD::SHL:
11769      return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
11770                         DAG.getConstant(ShiftAmt, MVT::i32));
11771    case ISD::SRL:
11772      return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
11773                         DAG.getConstant(ShiftAmt, MVT::i32));
11774    case ISD::SRA:
11775      return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
11776                         DAG.getConstant(ShiftAmt, MVT::i32));
11777    }
11778  }
11779
11780  return SDValue();
11781}
11782
11783static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
11784                                        const X86Subtarget* Subtarget) {
11785  EVT VT = Op.getValueType();
11786  SDLoc dl(Op);
11787  SDValue R = Op.getOperand(0);
11788  SDValue Amt = Op.getOperand(1);
11789
11790  if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
11791      VT == MVT::v4i32 || VT == MVT::v8i16 ||
11792      (Subtarget->hasInt256() &&
11793       ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
11794        VT == MVT::v8i32 || VT == MVT::v16i16))) {
11795    SDValue BaseShAmt;
11796    EVT EltVT = VT.getVectorElementType();
11797
11798    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
11799      unsigned NumElts = VT.getVectorNumElements();
11800      unsigned i, j;
11801      for (i = 0; i != NumElts; ++i) {
11802        if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
11803          continue;
11804        break;
11805      }
11806      for (j = i; j != NumElts; ++j) {
11807        SDValue Arg = Amt.getOperand(j);
11808        if (Arg.getOpcode() == ISD::UNDEF) continue;
11809        if (Arg != Amt.getOperand(i))
11810          break;
11811      }
11812      if (i != NumElts && j == NumElts)
11813        BaseShAmt = Amt.getOperand(i);
11814    } else {
11815      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
11816        Amt = Amt.getOperand(0);
11817      if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
11818               cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
11819        SDValue InVec = Amt.getOperand(0);
11820        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
11821          unsigned NumElts = InVec.getValueType().getVectorNumElements();
11822          unsigned i = 0;
11823          for (; i != NumElts; ++i) {
11824            SDValue Arg = InVec.getOperand(i);
11825            if (Arg.getOpcode() == ISD::UNDEF) continue;
11826            BaseShAmt = Arg;
11827            break;
11828          }
11829        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
11830           if (ConstantSDNode *C =
11831               dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
11832             unsigned SplatIdx =
11833               cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
11834             if (C->getZExtValue() == SplatIdx)
11835               BaseShAmt = InVec.getOperand(1);
11836           }
11837        }
11838        if (BaseShAmt.getNode() == 0)
11839          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
11840                                  DAG.getIntPtrConstant(0));
11841      }
11842    }
11843
11844    if (BaseShAmt.getNode()) {
11845      if (EltVT.bitsGT(MVT::i32))
11846        BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
11847      else if (EltVT.bitsLT(MVT::i32))
11848        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
11849
11850      switch (Op.getOpcode()) {
11851      default:
11852        llvm_unreachable("Unknown shift opcode!");
11853      case ISD::SHL:
11854        switch (VT.getSimpleVT().SimpleTy) {
11855        default: return SDValue();
11856        case MVT::v2i64:
11857        case MVT::v4i32:
11858        case MVT::v8i16:
11859        case MVT::v4i64:
11860        case MVT::v8i32:
11861        case MVT::v16i16:
11862          return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
11863        }
11864      case ISD::SRA:
11865        switch (VT.getSimpleVT().SimpleTy) {
11866        default: return SDValue();
11867        case MVT::v4i32:
11868        case MVT::v8i16:
11869        case MVT::v8i32:
11870        case MVT::v16i16:
11871          return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
11872        }
11873      case ISD::SRL:
11874        switch (VT.getSimpleVT().SimpleTy) {
11875        default: return SDValue();
11876        case MVT::v2i64:
11877        case MVT::v4i32:
11878        case MVT::v8i16:
11879        case MVT::v4i64:
11880        case MVT::v8i32:
11881        case MVT::v16i16:
11882          return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
11883        }
11884      }
11885    }
11886  }
11887
11888  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
11889  if (!Subtarget->is64Bit() &&
11890      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
11891      Amt.getOpcode() == ISD::BITCAST &&
11892      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
11893    Amt = Amt.getOperand(0);
11894    unsigned Ratio = Amt.getValueType().getVectorNumElements() /
11895                     VT.getVectorNumElements();
11896    std::vector<SDValue> Vals(Ratio);
11897    for (unsigned i = 0; i != Ratio; ++i)
11898      Vals[i] = Amt.getOperand(i);
11899    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
11900      for (unsigned j = 0; j != Ratio; ++j)
11901        if (Vals[j] != Amt.getOperand(i + j))
11902          return SDValue();
11903    }
11904    switch (Op.getOpcode()) {
11905    default:
11906      llvm_unreachable("Unknown shift opcode!");
11907    case ISD::SHL:
11908      return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
11909    case ISD::SRL:
11910      return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
11911    case ISD::SRA:
11912      return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
11913    }
11914  }
11915
11916  return SDValue();
11917}
11918
11919SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
11920
11921  EVT VT = Op.getValueType();
11922  SDLoc dl(Op);
11923  SDValue R = Op.getOperand(0);
11924  SDValue Amt = Op.getOperand(1);
11925  SDValue V;
11926
11927  if (!Subtarget->hasSSE2())
11928    return SDValue();
11929
11930  V = LowerScalarImmediateShift(Op, DAG, Subtarget);
11931  if (V.getNode())
11932    return V;
11933
11934  V = LowerScalarVariableShift(Op, DAG, Subtarget);
11935  if (V.getNode())
11936    return V;
11937
11938  // AVX2 has VPSLLV/VPSRAV/VPSRLV.
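       // VPSLLV/VPSRLV cover 32- and 64-bit elements, but the arithmetic form
       // VPSRAV exists only for 32-bit elements in AVX2, which is why the SRA
       // case below is restricted to v4i32/v8i32.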
11939  if (Subtarget->hasInt256()) {
11940    if (Op.getOpcode() == ISD::SRL &&
11941        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
11942         VT == MVT::v4i64 || VT == MVT::v8i32))
11943      return Op;
11944    if (Op.getOpcode() == ISD::SHL &&
11945        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
11946         VT == MVT::v4i64 || VT == MVT::v8i32))
11947      return Op;
11948    if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
11949      return Op;
11950  }
11951
11952  // Lower SHL with variable shift amount.
11953  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
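         // This builds (int)2^amt per lane without a per-element 32-bit shift,
         // which SSE2/AVX1 lack: shifting the amount left by 23 places it in the
         // IEEE-754 single-precision exponent field, and adding 0x3f800000 (the
         // encoding of 1.0f) biases it so the bit pattern reads as 2^amt.  For
         // example, amt = 3 gives (3 << 23) + 0x3f800000 = 0x41000000 = 8.0f;
         // FP_TO_SINT recovers the integer 8, and the final MUL computes R << 3.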
11954    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
11955
11956    Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
11957    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
11958    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
11959    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
11960  }
11961  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
11962    assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
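         // x86 has no instruction that shifts bytes by a per-element amount, so
         // the shift is assembled from three conditional steps.  The << 5 below
         // moves each byte's 3-bit shift amount into bits [7:5] of 'a'; bit 7
         // (the amount's weight-4 bit) selects a shift of the data by 4, and
         // after each 'a += a' the next amount bit reaches bit 7 and selects a
         // shift by 2 and then by 1.  Because the shifts are done as psllw on
         // v8i16 lanes, the data is pre-masked (0x0f, 0x3f) so bits cannot
         // spill into the neighbouring byte.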
11963
11964    // a = a << 5;
11965    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
11966    Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
11967
11968    // Turn 'a' into a mask suitable for VSELECT
11969    SDValue VSelM = DAG.getConstant(0x80, VT);
11970    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
11971    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
11972
11973    SDValue CM1 = DAG.getConstant(0x0f, VT);
11974    SDValue CM2 = DAG.getConstant(0x3f, VT);
11975
11976    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
11977    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
11978    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
11979                            DAG.getConstant(4, MVT::i32), DAG);
11980    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
11981    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
11982
11983    // a += a
11984    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
11985    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
11986    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
11987
11988    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
11989    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
11990    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
11991                            DAG.getConstant(2, MVT::i32), DAG);
11992    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
11993    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
11994
11995    // a += a
11996    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
11997    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
11998    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
11999
12000    // return VSELECT(r, r+r, a);
12001    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
12002                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
12003    return R;
12004  }
12005
12006  // Decompose 256-bit shifts into smaller 128-bit shifts.
12007  if (VT.is256BitVector()) {
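         // None of the single-instruction lowerings above matched (for example,
         // AVX1 has no 256-bit integer shifts at all, and AVX2 has no variable
         // 16-bit shifts), so shift the two 128-bit halves separately and
         // concatenate the results.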
12008    unsigned NumElems = VT.getVectorNumElements();
12009    MVT EltVT = VT.getVectorElementType().getSimpleVT();
12010    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
12011
12012    // Extract the two vectors
12013    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
12014    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
12015
12016    // Recreate the shift amount vectors
12017    SDValue Amt1, Amt2;
12018    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
12019      // Constant shift amount
12020      SmallVector<SDValue, 4> Amt1Csts;
12021      SmallVector<SDValue, 4> Amt2Csts;
12022      for (unsigned i = 0; i != NumElems/2; ++i)
12023        Amt1Csts.push_back(Amt->getOperand(i));
12024      for (unsigned i = NumElems/2; i != NumElems; ++i)
12025        Amt2Csts.push_back(Amt->getOperand(i));
12026
12027      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
12028                                 &Amt1Csts[0], NumElems/2);
12029      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
12030                                 &Amt2Csts[0], NumElems/2);
12031    } else {
12032      // Variable shift amount
12033      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
12034      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
12035    }
12036
12037    // Issue new vector shifts for the smaller types
12038    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
12039    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
12040
12041    // Concatenate the result back
12042    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
12043  }
12044
12045  return SDValue();
12046}
12047
12048static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
12049  // Lower the "add/sub/mul with overflow" instruction into a regular arithmetic
12050  // node plus a "setcc" node that checks the overflow flag. The "brcond"
12051  // lowering looks for this combination and may remove the "setcc" if it has
12052  // only one use.
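       // For example, "{i32, i1} = saddo a, b" becomes roughly:
       //   Sum, EFLAGS = X86ISD::ADD a, b
       //   Ovf         = X86ISD::SETCC COND_O, EFLAGS
       // with the two values tied back together by MERGE_VALUES.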
12053  SDNode *N = Op.getNode();
12054  SDValue LHS = N->getOperand(0);
12055  SDValue RHS = N->getOperand(1);
12056  unsigned BaseOp = 0;
12057  unsigned Cond = 0;
12058  SDLoc DL(Op);
12059  switch (Op.getOpcode()) {
12060  default: llvm_unreachable("Unknown ovf instruction!");
12061  case ISD::SADDO:
12062    // An add of one will be selected as an INC. Note that INC doesn't
12063    // set CF, so we can't do this for UADDO.
12064    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
12065      if (C->isOne()) {
12066        BaseOp = X86ISD::INC;
12067        Cond = X86::COND_O;
12068        break;
12069      }
12070    BaseOp = X86ISD::ADD;
12071    Cond = X86::COND_O;
12072    break;
12073  case ISD::UADDO:
12074    BaseOp = X86ISD::ADD;
12075    Cond = X86::COND_B;
12076    break;
12077  case ISD::SSUBO:
12078    // A subtract of one will be selected as a DEC. Note that DEC doesn't
12079    // set CF, so we can't do this for USUBO.
12080    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
12081      if (C->isOne()) {
12082        BaseOp = X86ISD::DEC;
12083        Cond = X86::COND_O;
12084        break;
12085      }
12086    BaseOp = X86ISD::SUB;
12087    Cond = X86::COND_O;
12088    break;
12089  case ISD::USUBO:
12090    BaseOp = X86ISD::SUB;
12091    Cond = X86::COND_B;
12092    break;
12093  case ISD::SMULO:
12094    BaseOp = X86ISD::SMUL;
12095    Cond = X86::COND_O;
12096    break;
12097  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
12098    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
12099                                 MVT::i32);
12100    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
12101
12102    SDValue SetCC =
12103      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
12104                  DAG.getConstant(X86::COND_O, MVT::i32),
12105                  SDValue(Sum.getNode(), 2));
12106
12107    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
12108  }
12109  }
12110
12111  // Also sets EFLAGS.
12112  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
12113  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
12114
12115  SDValue SetCC =
12116    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
12117                DAG.getConstant(Cond, MVT::i32),
12118                SDValue(Sum.getNode(), 1));
12119
12120  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
12121}
12122
12123SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
12124                                                  SelectionDAG &DAG) const {
12125  SDLoc dl(Op);
12126  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
12127  EVT VT = Op.getValueType();
12128
12129  if (!Subtarget->hasSSE2() || !VT.isVector())
12130    return SDValue();
12131
12132  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
12133                      ExtraVT.getScalarType().getSizeInBits();
12134  SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
12135
12136  switch (VT.getSimpleVT().SimpleTy) {
12137    default: return SDValue();
12138    case MVT::v8i32:
12139    case MVT::v16i16:
12140      if (!Subtarget->hasFp256())
12141        return SDValue();
12142      if (!Subtarget->hasInt256()) {
12143        // Needs to be split into two 128-bit halves.
12144        unsigned NumElems = VT.getVectorNumElements();
12145
12146        // Extract the LHS vectors
12147        SDValue LHS = Op.getOperand(0);
12148        SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
12149        SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
12150
12151        MVT EltVT = VT.getVectorElementType().getSimpleVT();
12152        EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
12153
12154        EVT ExtraEltVT = ExtraVT.getVectorElementType();
12155        unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
12156        ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
12157                                   ExtraNumElems/2);
12158        SDValue Extra = DAG.getValueType(ExtraVT);
12159
12160        LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
12161        LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
12162
12163        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
12164      }
12165      // fall through
12166    case MVT::v4i32:
12167    case MVT::v8i16: {
12168      // (sext (vzext x)) -> (vsext x)
12169      SDValue Op0 = Op.getOperand(0);
12170      SDValue Op00 = Op0.getOperand(0);
12171      SDValue Tmp1;
12172      // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
12173      if (Op0.getOpcode() == ISD::BITCAST &&
12174          Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
12175        Tmp1 = LowerVectorIntExtend(Op00, DAG);
12176      if (Tmp1.getNode()) {
12177        SDValue Tmp1Op0 = Tmp1.getOperand(0);
12178        assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
12179               "This optimization is invalid without a VZEXT.");
12180        return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
12181      }
12182
12183      // If the above didn't work, then just use Shift-Left + Shift-Right.
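           // sign_extend_inreg from N bits in a W-bit element is
           // (x << (W - N)) >>s (W - N); ShAmt computed above holds exactly
           // W - N (BitsDiff).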
12184      Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG);
12185      return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
12186    }
12187  }
12188}
12189
12190static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
12191                                 SelectionDAG &DAG) {
12192  SDLoc dl(Op);
12193  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
12194    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
12195  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
12196    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
12197
12198  // The only fence that needs an instruction is a sequentially-consistent
12199  // cross-thread fence.
12200  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
12201    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
12202    // no-sse2). There isn't any reason to disable it if the target processor
12203    // supports it.
12204    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
12205      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
12206
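         // Without SSE2 there is no mfence; fall back to a locked "or" of zero
         // into the top of the stack ("lock or dword ptr [esp], 0"), which
         // drains the store buffer and acts as a full barrier on such
         // processors.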
12207    SDValue Chain = Op.getOperand(0);
12208    SDValue Zero = DAG.getConstant(0, MVT::i32);
12209    SDValue Ops[] = {
12210      DAG.getRegister(X86::ESP, MVT::i32), // Base
12211      DAG.getTargetConstant(1, MVT::i8),   // Scale
12212      DAG.getRegister(0, MVT::i32),        // Index
12213      DAG.getTargetConstant(0, MVT::i32),  // Disp
12214      DAG.getRegister(0, MVT::i32),        // Segment.
12215      Zero,
12216      Chain
12217    };
12218    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
12219    return SDValue(Res, 0);
12220  }
12221
12222  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
12223  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
12224}
12225
12226static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
12227                             SelectionDAG &DAG) {
12228  EVT T = Op.getValueType();
12229  SDLoc DL(Op);
12230  unsigned Reg = 0;
12231  unsigned size = 0;
12232  switch(T.getSimpleVT().SimpleTy) {
12233  default: llvm_unreachable("Invalid value type!");
12234  case MVT::i8:  Reg = X86::AL;  size = 1; break;
12235  case MVT::i16: Reg = X86::AX;  size = 2; break;
12236  case MVT::i32: Reg = X86::EAX; size = 4; break;
12237  case MVT::i64:
12238    assert(Subtarget->is64Bit() && "Node not type legal!");
12239    Reg = X86::RAX; size = 8;
12240    break;
12241  }
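       // CMPXCHG takes the expected value in the accumulator (AL/AX/EAX/RAX) and
       // leaves the original memory value there afterwards, so the comparand is
       // copied in below and the result is copied back out of the same register.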
12242  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
12243                                    Op.getOperand(2), SDValue());
12244  SDValue Ops[] = { cpIn.getValue(0),
12245                    Op.getOperand(1),
12246                    Op.getOperand(3),
12247                    DAG.getTargetConstant(size, MVT::i8),
12248                    cpIn.getValue(1) };
12249  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
12250  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
12251  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
12252                                           Ops, array_lengthof(Ops), T, MMO);
12253  SDValue cpOut =
12254    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
12255  return cpOut;
12256}
12257
12258static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
12259                                     SelectionDAG &DAG) {
12260  assert(Subtarget->is64Bit() && "Result not type legalized?");
12261  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
12262  SDValue TheChain = Op.getOperand(0);
12263  SDLoc dl(Op);
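       // RDTSC places the timestamp in EDX:EAX.  In 64-bit mode the upper halves
       // of RAX and RDX are zeroed, so the full value is RAX | (RDX << 32).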
12264  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
12265  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
12266  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
12267                                   rax.getValue(2));
12268  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
12269                            DAG.getConstant(32, MVT::i8));
12270  SDValue Ops[] = {
12271    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
12272    rdx.getValue(1)
12273  };
12274  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
12275}
12276
12277SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
12278  EVT SrcVT = Op.getOperand(0).getValueType();
12279  EVT DstVT = Op.getValueType();
12280  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
12281         Subtarget->hasMMX() && "Unexpected custom BITCAST");
12282  assert((DstVT == MVT::i64 ||
12283          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
12284         "Unexpected custom BITCAST");
12285  // i64 <=> MMX conversions are Legal.
12286  if (SrcVT==MVT::i64 && DstVT.isVector())
12287    return Op;
12288  if (DstVT==MVT::i64 && SrcVT.isVector())
12289    return Op;
12290  // MMX <=> MMX conversions are Legal.
12291  if (SrcVT.isVector() && DstVT.isVector())
12292    return Op;
12293  // All other conversions need to be expanded.
12294  return SDValue();
12295}
12296
12297static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
12298  SDNode *Node = Op.getNode();
12299  SDLoc dl(Node);
12300  EVT T = Node->getValueType(0);
12301  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
12302                              DAG.getConstant(0, T), Node->getOperand(2));
12303  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
12304                       cast<AtomicSDNode>(Node)->getMemoryVT(),
12305                       Node->getOperand(0),
12306                       Node->getOperand(1), negOp,
12307                       cast<AtomicSDNode>(Node)->getSrcValue(),
12308                       cast<AtomicSDNode>(Node)->getAlignment(),
12309                       cast<AtomicSDNode>(Node)->getOrdering(),
12310                       cast<AtomicSDNode>(Node)->getSynchScope());
12311}
12312
12313static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
12314  SDNode *Node = Op.getNode();
12315  SDLoc dl(Node);
12316  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
12317
12318  // Convert seq_cst store -> xchg
12319  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
12320  // FIXME: On 32-bit, store -> fist or movq would be more efficient
12321  //        (The only way to get a 16-byte store is cmpxchg16b)
12322  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
12323  if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
12324      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
12325    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
12326                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
12327                                 Node->getOperand(0),
12328                                 Node->getOperand(1), Node->getOperand(2),
12329                                 cast<AtomicSDNode>(Node)->getMemOperand(),
12330                                 cast<AtomicSDNode>(Node)->getOrdering(),
12331                                 cast<AtomicSDNode>(Node)->getSynchScope());
12332    return Swap.getValue(1);
12333  }
12334  // Other atomic stores have a simple pattern.
12335  return Op;
12336}
12337
12338static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
12339  EVT VT = Op.getNode()->getValueType(0);
12340
12341  // Let legalize expand this if it isn't a legal type yet.
12342  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
12343    return SDValue();
12344
12345  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
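       // The second result is the i32 EFLAGS value.  For ADDE/SUBE the incoming
       // carry arrives as the extra third operand, which X86ISD::ADC/SBB consume.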
12346
12347  unsigned Opc;
12348  bool ExtraOp = false;
12349  switch (Op.getOpcode()) {
12350  default: llvm_unreachable("Invalid code");
12351  case ISD::ADDC: Opc = X86ISD::ADD; break;
12352  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
12353  case ISD::SUBC: Opc = X86ISD::SUB; break;
12354  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
12355  }
12356
12357  if (!ExtraOp)
12358    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
12359                       Op.getOperand(1));
12360  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
12361                     Op.getOperand(1), Op.getOperand(2));
12362}
12363
12364SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
12365  assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
12366
12367  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
12368  // which returns the values as { float, float } (in XMM0) or
12369  // { double, double } (which is returned in XMM0, XMM1).
12370  SDLoc dl(Op);
12371  SDValue Arg = Op.getOperand(0);
12372  EVT ArgVT = Arg.getValueType();
12373  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
12374
12375  ArgListTy Args;
12376  ArgListEntry Entry;
12377
12378  Entry.Node = Arg;
12379  Entry.Ty = ArgTy;
12380  Entry.isSExt = false;
12381  Entry.isZExt = false;
12382  Args.push_back(Entry);
12383
12384  bool isF64 = ArgVT == MVT::f64;
12385  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
12386  // the small struct {f32, f32} is returned in (eax, edx). For f64,
12387  // the results are returned via SRet in memory.
12388  const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
12389  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
12390
12391  Type *RetTy = isF64
12392    ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
12393    : (Type*)VectorType::get(ArgTy, 4);
12394  TargetLowering::
12395    CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
12396                         false, false, false, false, 0,
12397                         CallingConv::C, /*isTailCall=*/false,
12398                         /*doesNotRet=*/false, /*isReturnValueUsed*/true,
12399                         Callee, Args, DAG, dl);
12400  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
12401
12402  if (isF64)
12403    // Returned in xmm0 and xmm1.
12404    return CallResult.first;
12405
12406  // Returned in bits 0:31 and 32:63 of xmm0.
12407  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
12408                               CallResult.first, DAG.getIntPtrConstant(0));
12409  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
12410                               CallResult.first, DAG.getIntPtrConstant(1));
12411  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
12412  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
12413}
12414
12415/// LowerOperation - Provide custom lowering hooks for some operations.
12416///
12417SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
12418  switch (Op.getOpcode()) {
12419  default: llvm_unreachable("Should not custom lower this!");
12420  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
12421  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
12422  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op, Subtarget, DAG);
12423  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
12424  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
12425  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
12426  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
12427  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
12428  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
12429  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
12430  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
12431  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
12432  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
12433  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
12434  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
12435  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
12436  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
12437  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
12438  case ISD::SHL_PARTS:
12439  case ISD::SRA_PARTS:
12440  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
12441  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
12442  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
12443  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
12444  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, DAG);
12445  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, DAG);
12446  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, DAG);
12447  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
12448  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
12449  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
12450  case ISD::FABS:               return LowerFABS(Op, DAG);
12451  case ISD::FNEG:               return LowerFNEG(Op, DAG);
12452  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
12453  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
12454  case ISD::SETCC:              return LowerSETCC(Op, DAG);
12455  case ISD::SELECT:             return LowerSELECT(Op, DAG);
12456  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
12457  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
12458  case ISD::VASTART:            return LowerVASTART(Op, DAG);
12459  case ISD::VAARG:              return LowerVAARG(Op, DAG);
12460  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
12461  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
12462  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
12463  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
12464  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
12465  case ISD::FRAME_TO_ARGS_OFFSET:
12466                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
12467  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
12468  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
12469  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
12470  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
12471  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
12472  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
12473  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
12474  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
12475  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
12476  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
12477  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
12478  case ISD::SRA:
12479  case ISD::SRL:
12480  case ISD::SHL:                return LowerShift(Op, DAG);
12481  case ISD::SADDO:
12482  case ISD::UADDO:
12483  case ISD::SSUBO:
12484  case ISD::USUBO:
12485  case ISD::SMULO:
12486  case ISD::UMULO:              return LowerXALUO(Op, DAG);
12487  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
12488  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
12489  case ISD::ADDC:
12490  case ISD::ADDE:
12491  case ISD::SUBC:
12492  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
12493  case ISD::ADD:                return LowerADD(Op, DAG);
12494  case ISD::SUB:                return LowerSUB(Op, DAG);
12495  case ISD::SDIV:               return LowerSDIV(Op, DAG);
12496  case ISD::FSINCOS:            return LowerFSINCOS(Op, DAG);
12497  }
12498}
12499
12500static void ReplaceATOMIC_LOAD(SDNode *Node,
12501                                  SmallVectorImpl<SDValue> &Results,
12502                                  SelectionDAG &DAG) {
12503  SDLoc dl(Node);
12504  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
12505
12506  // Convert wide load -> cmpxchg8b/cmpxchg16b
12507  // FIXME: On 32-bit, load -> fild or movq would be more efficient
12508  //        (The only way to get a 16-byte load is cmpxchg16b)
12509  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
12510  SDValue Zero = DAG.getConstant(0, VT);
12511  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
12512                               Node->getOperand(0),
12513                               Node->getOperand(1), Zero, Zero,
12514                               cast<AtomicSDNode>(Node)->getMemOperand(),
12515                               cast<AtomicSDNode>(Node)->getOrdering(),
12516                               cast<AtomicSDNode>(Node)->getSynchScope());
12517  Results.push_back(Swap.getValue(0));
12518  Results.push_back(Swap.getValue(1));
12519}
12520
12521static void
12522ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
12523                        SelectionDAG &DAG, unsigned NewOp) {
12524  SDLoc dl(Node);
12525  assert (Node->getValueType(0) == MVT::i64 &&
12526          "Only know how to expand i64 atomics");
12527
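       // On 32-bit targets the i64 operand is split into two i32 halves; the
       // resulting node is expanded later into a cmpxchg8b-based loop, and the
       // two i32 results are reassembled into an i64 with BUILD_PAIR below.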
12528  SDValue Chain = Node->getOperand(0);
12529  SDValue In1 = Node->getOperand(1);
12530  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
12531                             Node->getOperand(2), DAG.getIntPtrConstant(0));
12532  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
12533                             Node->getOperand(2), DAG.getIntPtrConstant(1));
12534  SDValue Ops[] = { Chain, In1, In2L, In2H };
12535  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
12536  SDValue Result =
12537    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
12538                            cast<MemSDNode>(Node)->getMemOperand());
12539  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
12540  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
12541  Results.push_back(Result.getValue(2));
12542}
12543
12544/// ReplaceNodeResults - Replace a node with an illegal result type
12545/// with a new node built out of custom code.
12546void X86TargetLowering::ReplaceNodeResults(SDNode *N,
12547                                           SmallVectorImpl<SDValue>&Results,
12548                                           SelectionDAG &DAG) const {
12549  SDLoc dl(N);
12550  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12551  switch (N->getOpcode()) {
12552  default:
12553    llvm_unreachable("Do not know how to custom type legalize this operation!");
12554  case ISD::SIGN_EXTEND_INREG:
12555  case ISD::ADDC:
12556  case ISD::ADDE:
12557  case ISD::SUBC:
12558  case ISD::SUBE:
12559    // We don't want to expand or promote these.
12560    return;
12561  case ISD::FP_TO_SINT:
12562  case ISD::FP_TO_UINT: {
12563    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
12564
12565    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
12566      return;
12567
12568    std::pair<SDValue,SDValue> Vals =
12569        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
12570    SDValue FIST = Vals.first, StackSlot = Vals.second;
12571    if (FIST.getNode() != 0) {
12572      EVT VT = N->getValueType(0);
12573      // Return a load from the stack slot.
12574      if (StackSlot.getNode() != 0)
12575        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
12576                                      MachinePointerInfo(),
12577                                      false, false, false, 0));
12578      else
12579        Results.push_back(FIST);
12580    }
12581    return;
12582  }
12583  case ISD::UINT_TO_FP: {
12584    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
12585    if (N->getOperand(0).getValueType() != MVT::v2i32 ||
12586        N->getValueType(0) != MVT::v2f32)
12587      return;
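         // Classic "magic number" conversion: 0x4330000000000000 is 2^52, whose
         // 52-bit mantissa is all zeros.  OR-ing the zero-extended 32-bit value
         // into the low mantissa bits yields exactly the double 2^52 + x, so
         // subtracting the bias leaves x as an exact f64, which VFPROUND then
         // narrows to f32.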
12588    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
12589                                 N->getOperand(0));
12590    SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
12591                                     MVT::f64);
12592    SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
12593    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
12594                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
12595    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
12596    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
12597    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
12598    return;
12599  }
12600  case ISD::FP_ROUND: {
12601    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
12602      return;
12603    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
12604    Results.push_back(V);
12605    return;
12606  }
12607  case ISD::READCYCLECOUNTER: {
12608    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
12609    SDValue TheChain = N->getOperand(0);
12610    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
12611    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
12612                                     rd.getValue(1));
12613    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
12614                                     eax.getValue(2));
12615    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
12616    SDValue Ops[] = { eax, edx };
12617    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops,
12618                                  array_lengthof(Ops)));
12619    Results.push_back(edx.getValue(1));
12620    return;
12621  }
12622  case ISD::ATOMIC_CMP_SWAP: {
12623    EVT T = N->getValueType(0);
12624    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
12625    bool Regs64bit = T == MVT::i128;
12626    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
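         // CMPXCHG8B/16B take the comparand in (R|E)DX:(R|E)AX and the new value
         // in (R|E)CX:(R|E)BX, and return the previous memory value in
         // (R|E)DX:(R|E)AX, hence the explicit register copies below.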
12627    SDValue cpInL, cpInH;
12628    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
12629                        DAG.getConstant(0, HalfT));
12630    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
12631                        DAG.getConstant(1, HalfT));
12632    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
12633                             Regs64bit ? X86::RAX : X86::EAX,
12634                             cpInL, SDValue());
12635    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
12636                             Regs64bit ? X86::RDX : X86::EDX,
12637                             cpInH, cpInL.getValue(1));
12638    SDValue swapInL, swapInH;
12639    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
12640                          DAG.getConstant(0, HalfT));
12641    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
12642                          DAG.getConstant(1, HalfT));
12643    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
12644                               Regs64bit ? X86::RBX : X86::EBX,
12645                               swapInL, cpInH.getValue(1));
12646    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
12647                               Regs64bit ? X86::RCX : X86::ECX,
12648                               swapInH, swapInL.getValue(1));
12649    SDValue Ops[] = { swapInH.getValue(0),
12650                      N->getOperand(1),
12651                      swapInH.getValue(1) };
12652    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
12653    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
12654    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
12655                                  X86ISD::LCMPXCHG8_DAG;
12656    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
12657                                             Ops, array_lengthof(Ops), T, MMO);
12658    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
12659                                        Regs64bit ? X86::RAX : X86::EAX,
12660                                        HalfT, Result.getValue(1));
12661    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
12662                                        Regs64bit ? X86::RDX : X86::EDX,
12663                                        HalfT, cpOutL.getValue(2));
12664    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
12665    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
12666    Results.push_back(cpOutH.getValue(1));
12667    return;
12668  }
12669  case ISD::ATOMIC_LOAD_ADD:
12670  case ISD::ATOMIC_LOAD_AND:
12671  case ISD::ATOMIC_LOAD_NAND:
12672  case ISD::ATOMIC_LOAD_OR:
12673  case ISD::ATOMIC_LOAD_SUB:
12674  case ISD::ATOMIC_LOAD_XOR:
12675  case ISD::ATOMIC_LOAD_MAX:
12676  case ISD::ATOMIC_LOAD_MIN:
12677  case ISD::ATOMIC_LOAD_UMAX:
12678  case ISD::ATOMIC_LOAD_UMIN:
12679  case ISD::ATOMIC_SWAP: {
12680    unsigned Opc;
12681    switch (N->getOpcode()) {
12682    default: llvm_unreachable("Unexpected opcode");
12683    case ISD::ATOMIC_LOAD_ADD:
12684      Opc = X86ISD::ATOMADD64_DAG;
12685      break;
12686    case ISD::ATOMIC_LOAD_AND:
12687      Opc = X86ISD::ATOMAND64_DAG;
12688      break;
12689    case ISD::ATOMIC_LOAD_NAND:
12690      Opc = X86ISD::ATOMNAND64_DAG;
12691      break;
12692    case ISD::ATOMIC_LOAD_OR:
12693      Opc = X86ISD::ATOMOR64_DAG;
12694      break;
12695    case ISD::ATOMIC_LOAD_SUB:
12696      Opc = X86ISD::ATOMSUB64_DAG;
12697      break;
12698    case ISD::ATOMIC_LOAD_XOR:
12699      Opc = X86ISD::ATOMXOR64_DAG;
12700      break;
12701    case ISD::ATOMIC_LOAD_MAX:
12702      Opc = X86ISD::ATOMMAX64_DAG;
12703      break;
12704    case ISD::ATOMIC_LOAD_MIN:
12705      Opc = X86ISD::ATOMMIN64_DAG;
12706      break;
12707    case ISD::ATOMIC_LOAD_UMAX:
12708      Opc = X86ISD::ATOMUMAX64_DAG;
12709      break;
12710    case ISD::ATOMIC_LOAD_UMIN:
12711      Opc = X86ISD::ATOMUMIN64_DAG;
12712      break;
12713    case ISD::ATOMIC_SWAP:
12714      Opc = X86ISD::ATOMSWAP64_DAG;
12715      break;
12716    }
12717    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
12718    return;
12719  }
12720  case ISD::ATOMIC_LOAD:
12721    ReplaceATOMIC_LOAD(N, Results, DAG);
12722  }
12723}
12724
12725const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
12726  switch (Opcode) {
12727  default: return NULL;
12728  case X86ISD::BSF:                return "X86ISD::BSF";
12729  case X86ISD::BSR:                return "X86ISD::BSR";
12730  case X86ISD::SHLD:               return "X86ISD::SHLD";
12731  case X86ISD::SHRD:               return "X86ISD::SHRD";
12732  case X86ISD::FAND:               return "X86ISD::FAND";
12733  case X86ISD::FOR:                return "X86ISD::FOR";
12734  case X86ISD::FXOR:               return "X86ISD::FXOR";
12735  case X86ISD::FSRL:               return "X86ISD::FSRL";
12736  case X86ISD::FILD:               return "X86ISD::FILD";
12737  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
12738  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
12739  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
12740  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
12741  case X86ISD::FLD:                return "X86ISD::FLD";
12742  case X86ISD::FST:                return "X86ISD::FST";
12743  case X86ISD::CALL:               return "X86ISD::CALL";
12744  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
12745  case X86ISD::BT:                 return "X86ISD::BT";
12746  case X86ISD::CMP:                return "X86ISD::CMP";
12747  case X86ISD::COMI:               return "X86ISD::COMI";
12748  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
12749  case X86ISD::SETCC:              return "X86ISD::SETCC";
12750  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
12751  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
12752  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
12753  case X86ISD::CMOV:               return "X86ISD::CMOV";
12754  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
12755  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
12756  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
12757  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
12758  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
12759  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
12760  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
12761  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
12762  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
12763  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
12764  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
12765  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
12766  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
12767  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
12768  case X86ISD::PSIGN:              return "X86ISD::PSIGN";
12769  case X86ISD::BLENDV:             return "X86ISD::BLENDV";
12770  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
12771  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
12772  case X86ISD::HADD:               return "X86ISD::HADD";
12773  case X86ISD::HSUB:               return "X86ISD::HSUB";
12774  case X86ISD::FHADD:              return "X86ISD::FHADD";
12775  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
12776  case X86ISD::UMAX:               return "X86ISD::UMAX";
12777  case X86ISD::UMIN:               return "X86ISD::UMIN";
12778  case X86ISD::SMAX:               return "X86ISD::SMAX";
12779  case X86ISD::SMIN:               return "X86ISD::SMIN";
12780  case X86ISD::FMAX:               return "X86ISD::FMAX";
12781  case X86ISD::FMIN:               return "X86ISD::FMIN";
12782  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
12783  case X86ISD::FMINC:              return "X86ISD::FMINC";
12784  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
12785  case X86ISD::FRCP:               return "X86ISD::FRCP";
12786  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
12787  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
12788  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
12789  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
12790  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
12791  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
12792  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
12793  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
12794  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
12795  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
12796  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
12797  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
12798  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
12799  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
12800  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
12801  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
12802  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
12803  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
12804  case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
12805  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
12806  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
12807  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
12808  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
12809  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
12810  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
12811  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
12812  case X86ISD::VSHL:               return "X86ISD::VSHL";
12813  case X86ISD::VSRL:               return "X86ISD::VSRL";
12814  case X86ISD::VSRA:               return "X86ISD::VSRA";
12815  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
12816  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
12817  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
12818  case X86ISD::CMPP:               return "X86ISD::CMPP";
12819  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
12820  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
12821  case X86ISD::ADD:                return "X86ISD::ADD";
12822  case X86ISD::SUB:                return "X86ISD::SUB";
12823  case X86ISD::ADC:                return "X86ISD::ADC";
12824  case X86ISD::SBB:                return "X86ISD::SBB";
12825  case X86ISD::SMUL:               return "X86ISD::SMUL";
12826  case X86ISD::UMUL:               return "X86ISD::UMUL";
12827  case X86ISD::INC:                return "X86ISD::INC";
12828  case X86ISD::DEC:                return "X86ISD::DEC";
12829  case X86ISD::OR:                 return "X86ISD::OR";
12830  case X86ISD::XOR:                return "X86ISD::XOR";
12831  case X86ISD::AND:                return "X86ISD::AND";
12832  case X86ISD::BLSI:               return "X86ISD::BLSI";
12833  case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
12834  case X86ISD::BLSR:               return "X86ISD::BLSR";
12835  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
12836  case X86ISD::PTEST:              return "X86ISD::PTEST";
12837  case X86ISD::TESTP:              return "X86ISD::TESTP";
12838  case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
12839  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
12840  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
12841  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
12842  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
12843  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
12844  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
12845  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
12846  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
12847  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
12848  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
12849  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
12850  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
12851  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
12852  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
12853  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
12854  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
12855  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
12856  case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
12857  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
12858  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
12859  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
12860  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
12861  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
12862  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
12863  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
12864  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
12865  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
12866  case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
12867  case X86ISD::SAHF:               return "X86ISD::SAHF";
12868  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
12869  case X86ISD::RDSEED:             return "X86ISD::RDSEED";
12870  case X86ISD::FMADD:              return "X86ISD::FMADD";
12871  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
12872  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
12873  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
12874  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
12875  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
12876  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
12877  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
12878  case X86ISD::XTEST:              return "X86ISD::XTEST";
12879  }
12880}
12881
12882// isLegalAddressingMode - Return true if the addressing mode represented
12883// by AM is legal for this target, for a load/store of the specified type.
12884bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
12885                                              Type *Ty) const {
12886  // X86 supports extremely general addressing modes.
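       // The hardware form is BaseGV + BaseReg + Scale*IndexReg + Disp, where
       // Disp must fit in a sign-extended 32-bit field and Scale is normally
       // 1, 2, 4 or 8 (3, 5 and 9 are accepted only when the base register slot
       // is still free, because they are formed as IndexReg + Scale'*IndexReg
       // using both the base and index fields).  The checks below verify each
       // of these components.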
12887  CodeModel::Model M = getTargetMachine().getCodeModel();
12888  Reloc::Model R = getTargetMachine().getRelocationModel();
12889
12890  // X86 allows a sign-extended 32-bit immediate field as a displacement.
12891  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
12892    return false;
12893
12894  if (AM.BaseGV) {
12895    unsigned GVFlags =
12896      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
12897
12898    // If a reference to this global requires an extra load, we can't fold it.
12899    if (isGlobalStubReference(GVFlags))
12900      return false;
12901
12902    // If BaseGV requires a register for the PIC base, we cannot also have a
12903    // BaseReg specified.
12904    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
12905      return false;
12906
12907    // If lower 4G is not available, then we must use rip-relative addressing.
12908    if ((M != CodeModel::Small || R != Reloc::Static) &&
12909        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
12910      return false;
12911  }
12912
12913  switch (AM.Scale) {
12914  case 0:
12915  case 1:
12916  case 2:
12917  case 4:
12918  case 8:
12919    // These scales always work.
12920    break;
12921  case 3:
12922  case 5:
12923  case 9:
12924    // These scales are formed with basereg+scalereg.  Only accept if there is
12925    // no basereg yet.
12926    if (AM.HasBaseReg)
12927      return false;
12928    break;
12929  default:  // Other stuff never works.
12930    return false;
12931  }
12932
12933  return true;
12934}
12935
12936bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
12937  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
12938    return false;
12939  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
12940  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
12941  return NumBits1 > NumBits2;
12942}
12943
12944bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
12945  return isInt<32>(Imm);
12946}
12947
12948bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
12949  // Can also use sub to handle negated immediates.
12950  return isInt<32>(Imm);
12951}
12952
12953bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
12954  if (!VT1.isInteger() || !VT2.isInteger())
12955    return false;
12956  unsigned NumBits1 = VT1.getSizeInBits();
12957  unsigned NumBits2 = VT2.getSizeInBits();
12958  return NumBits1 > NumBits2;
12959}
12960
12961bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
12962  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
12963  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
12964}
12965
12966bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
12967  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
12968  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
12969}
12970
12971bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
12972  EVT VT1 = Val.getValueType();
12973  if (isZExtFree(VT1, VT2))
12974    return true;
12975
12976  if (Val.getOpcode() != ISD::LOAD)
12977    return false;
12978
12979  if (!VT1.isSimple() || !VT1.isInteger() ||
12980      !VT2.isSimple() || !VT2.isInteger())
12981    return false;
12982
12983  switch (VT1.getSimpleVT().SimpleTy) {
12984  default: break;
12985  case MVT::i8:
12986  case MVT::i16:
12987  case MVT::i32:
12988    // X86 has 8, 16, and 32-bit zero-extending loads.
12989    return true;
12990  }
12991
12992  return false;
12993}
12994
12995bool
12996X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
12997  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
12998    return false;
12999
13000  VT = VT.getScalarType();
13001
13002  if (!VT.isSimple())
13003    return false;
13004
13005  switch (VT.getSimpleVT().SimpleTy) {
13006  case MVT::f32:
13007  case MVT::f64:
13008    return true;
13009  default:
13010    break;
13011  }
13012
13013  return false;
13014}
13015
13016bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
13017  // i16 instructions are longer (0x66 prefix) and potentially slower.
13018  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
13019}
13020
13021/// isShuffleMaskLegal - Targets can use this to indicate that they only
13022/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
13023/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
13024/// are assumed to be legal.
13025bool
13026X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
13027                                      EVT VT) const {
13028  // Very little shuffling can be done for 64-bit vectors right now.
13029  if (VT.getSizeInBits() == 64)
13030    return false;
13031
13032  // FIXME: pshufb, blends, shifts.
13033  return (VT.getVectorNumElements() == 2 ||
13034          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
13035          isMOVLMask(M, VT) ||
13036          isSHUFPMask(M, VT, Subtarget->hasFp256()) ||
13037          isPSHUFDMask(M, VT) ||
13038          isPSHUFHWMask(M, VT, Subtarget->hasInt256()) ||
13039          isPSHUFLWMask(M, VT, Subtarget->hasInt256()) ||
13040          isPALIGNRMask(M, VT, Subtarget) ||
13041          isUNPCKLMask(M, VT, Subtarget->hasInt256()) ||
13042          isUNPCKHMask(M, VT, Subtarget->hasInt256()) ||
13043          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) ||
13044          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256()));
13045}
13046
13047bool
13048X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
13049                                          EVT VT) const {
13050  unsigned NumElts = VT.getVectorNumElements();
13051  // FIXME: This collection of masks seems suspect.
13052  if (NumElts == 2)
13053    return true;
13054  if (NumElts == 4 && VT.is128BitVector()) {
13055    return (isMOVLMask(Mask, VT)  ||
13056            isCommutedMOVLMask(Mask, VT, true) ||
13057            isSHUFPMask(Mask, VT, Subtarget->hasFp256()) ||
13058            isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true));
13059  }
13060  return false;
13061}
13062
13063//===----------------------------------------------------------------------===//
13064//                           X86 Scheduler Hooks
13065//===----------------------------------------------------------------------===//
13066
13067/// Utility function to emit xbegin specifying the start of an RTM region.
13068static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
13069                                     const TargetInstrInfo *TII) {
13070  DebugLoc DL = MI->getDebugLoc();
13071
13072  const BasicBlock *BB = MBB->getBasicBlock();
13073  MachineFunction::iterator I = MBB;
13074  ++I;
13075
13076  // For the v = xbegin(), we generate
13077  //
13078  // thisMBB:
13079  //  xbegin sinkMBB
13080  //
13081  // mainMBB:
13082  //  eax = -1
13083  //
13084  // sinkMBB:
13085  //  v = eax
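       //
       // XBEGIN falls through to mainMBB when the transaction starts, so that
       // path loads EAX with -1 (XBEGIN_STARTED); on an abort the hardware
       // resumes at the fallback label sinkMBB with the abort status already in
       // EAX.  Either way the result 'v' is whatever EAX holds at sinkMBB.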
13086
13087  MachineBasicBlock *thisMBB = MBB;
13088  MachineFunction *MF = MBB->getParent();
13089  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
13090  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
13091  MF->insert(I, mainMBB);
13092  MF->insert(I, sinkMBB);
13093
13094  // Transfer the remainder of MBB and its successor edges to sinkMBB.
13095  sinkMBB->splice(sinkMBB->begin(), MBB,
13096                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
13097  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
13098
13099  // thisMBB:
13100  //  xbegin sinkMBB
13101  //  # fallthrough to mainMBB
13102  //  # abort to sinkMBB
13103  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
13104  thisMBB->addSuccessor(mainMBB);
13105  thisMBB->addSuccessor(sinkMBB);
13106
13107  // mainMBB:
13108  //  EAX = -1
13109  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
13110  mainMBB->addSuccessor(sinkMBB);
13111
13112  // sinkMBB:
13113  // EAX is live into the sinkMBB
13114  sinkMBB->addLiveIn(X86::EAX);
13115  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
13116          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
13117    .addReg(X86::EAX);
13118
13119  MI->eraseFromParent();
13120  return sinkMBB;
13121}
13122
13123// Get CMPXCHG opcode for the specified data type.
13124static unsigned getCmpXChgOpcode(EVT VT) {
13125  switch (VT.getSimpleVT().SimpleTy) {
13126  case MVT::i8:  return X86::LCMPXCHG8;
13127  case MVT::i16: return X86::LCMPXCHG16;
13128  case MVT::i32: return X86::LCMPXCHG32;
13129  case MVT::i64: return X86::LCMPXCHG64;
13130  default:
13131    break;
13132  }
13133  llvm_unreachable("Invalid operand size!");
13134}
13135
13136// Get LOAD opcode for the specified data type.
13137static unsigned getLoadOpcode(EVT VT) {
13138  switch (VT.getSimpleVT().SimpleTy) {
13139  case MVT::i8:  return X86::MOV8rm;
13140  case MVT::i16: return X86::MOV16rm;
13141  case MVT::i32: return X86::MOV32rm;
13142  case MVT::i64: return X86::MOV64rm;
13143  default:
13144    break;
13145  }
13146  llvm_unreachable("Invalid operand size!");
13147}
13148
13149// Get the non-atomic opcode corresponding to the given atomic instruction.
13150static unsigned getNonAtomicOpcode(unsigned Opc) {
13151  switch (Opc) {
13152  case X86::ATOMAND8:  return X86::AND8rr;
13153  case X86::ATOMAND16: return X86::AND16rr;
13154  case X86::ATOMAND32: return X86::AND32rr;
13155  case X86::ATOMAND64: return X86::AND64rr;
13156  case X86::ATOMOR8:   return X86::OR8rr;
13157  case X86::ATOMOR16:  return X86::OR16rr;
13158  case X86::ATOMOR32:  return X86::OR32rr;
13159  case X86::ATOMOR64:  return X86::OR64rr;
13160  case X86::ATOMXOR8:  return X86::XOR8rr;
13161  case X86::ATOMXOR16: return X86::XOR16rr;
13162  case X86::ATOMXOR32: return X86::XOR32rr;
13163  case X86::ATOMXOR64: return X86::XOR64rr;
13164  }
13165  llvm_unreachable("Unhandled atomic-load-op opcode!");
13166}
13167
13168// Get the non-atomic opcode corresponding to the given atomic instruction,
13169// plus an extra opcode (NOT for NAND, CMP for the min/max variants).
13170static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
13171                                               unsigned &ExtraOpc) {
13172  switch (Opc) {
13173  case X86::ATOMNAND8:  ExtraOpc = X86::NOT8r;   return X86::AND8rr;
13174  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r;  return X86::AND16rr;
13175  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r;  return X86::AND32rr;
13176  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r;  return X86::AND64rr;
13177  case X86::ATOMMAX8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVL32rr;
13178  case X86::ATOMMAX16:  ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
13179  case X86::ATOMMAX32:  ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
13180  case X86::ATOMMAX64:  ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
13181  case X86::ATOMMIN8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVG32rr;
13182  case X86::ATOMMIN16:  ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
13183  case X86::ATOMMIN32:  ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
13184  case X86::ATOMMIN64:  ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
13185  case X86::ATOMUMAX8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVB32rr;
13186  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
13187  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
13188  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
13189  case X86::ATOMUMIN8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVA32rr;
13190  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
13191  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
13192  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
13193  }
13194  llvm_unreachable("Unhandled atomic-load-op opcode!");
13195}
13196
13197// Get the non-atomic opcodes (low half returned, high half in HiOpc) for the
13198// given atomic instruction operating on 64-bit data on a 32-bit target.
13199static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
13200  switch (Opc) {
13201  case X86::ATOMAND6432:  HiOpc = X86::AND32rr; return X86::AND32rr;
13202  case X86::ATOMOR6432:   HiOpc = X86::OR32rr;  return X86::OR32rr;
13203  case X86::ATOMXOR6432:  HiOpc = X86::XOR32rr; return X86::XOR32rr;
13204  case X86::ATOMADD6432:  HiOpc = X86::ADC32rr; return X86::ADD32rr;
13205  case X86::ATOMSUB6432:  HiOpc = X86::SBB32rr; return X86::SUB32rr;
13206  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
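  // For the min/max pseudos the returned "operation" is a SETcc; the caller
  // combines the low/high condition bytes and uses the result to drive CMOVs.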
13207  case X86::ATOMMAX6432:  HiOpc = X86::SETLr;   return X86::SETLr;
13208  case X86::ATOMMIN6432:  HiOpc = X86::SETGr;   return X86::SETGr;
13209  case X86::ATOMUMAX6432: HiOpc = X86::SETBr;   return X86::SETBr;
13210  case X86::ATOMUMIN6432: HiOpc = X86::SETAr;   return X86::SETAr;
13211  }
13212  llvm_unreachable("Unhandled atomic-load-op opcode!");
13213}
13214
13215// Get the non-atomic opcodes (low/high halves) plus an extra opcode for the
13216// given atomic instruction operating on 64-bit data on a 32-bit target.
13217static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
13218                                                   unsigned &HiOpc,
13219                                                   unsigned &ExtraOpc) {
13220  switch (Opc) {
13221  case X86::ATOMNAND6432:
13222    ExtraOpc = X86::NOT32r;
13223    HiOpc = X86::AND32rr;
13224    return X86::AND32rr;
13225  }
13226  llvm_unreachable("Unhandled atomic-load-op opcode!");
13227}
13228
13229// Get the pseudo CMOV opcode for the specified data type.
13230static unsigned getPseudoCMOVOpc(EVT VT) {
13231  switch (VT.getSimpleVT().SimpleTy) {
13232  case MVT::i8:  return X86::CMOV_GR8;
13233  case MVT::i16: return X86::CMOV_GR16;
13234  case MVT::i32: return X86::CMOV_GR32;
13235  default:
13236    break;
13237  }
13238  llvm_unreachable("Unknown CMOV opcode!");
13239}
13240
13241// EmitAtomicLoadArith - Emit the code sequence for a pseudo atomic
13242// instruction. It is translated into a compare-exchange (spin) loop, from
13243//
13244//    ...
13245//    dst = atomic-fetch-op MI.addr, MI.val
13246//    ...
13247//
13248// to
13249//
13250//    ...
13251//    t1 = LOAD MI.addr
13252// loop:
13253//    t4 = phi(t1, t3 / loop)
13254//    t2 = OP MI.val, t4
13255//    EAX = t4
13256//    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
13257//    t3 = EAX
13258//    JNE loop
13259// sink:
13260//    dst = t3
13261//    ...
13262MachineBasicBlock *
13263X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
13264                                       MachineBasicBlock *MBB) const {
13265  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13266  DebugLoc DL = MI->getDebugLoc();
13267
13268  MachineFunction *MF = MBB->getParent();
13269  MachineRegisterInfo &MRI = MF->getRegInfo();
13270
13271  const BasicBlock *BB = MBB->getBasicBlock();
13272  MachineFunction::iterator I = MBB;
13273  ++I;
13274
13275  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
13276         "Unexpected number of operands");
13277
13278  assert(MI->hasOneMemOperand() &&
13279         "Expected atomic-load-op to have one memoperand");
13280
13281  // Memory Reference
13282  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
13283  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
13284
13285  unsigned DstReg, SrcReg;
13286  unsigned MemOpndSlot;
13287
13288  unsigned CurOp = 0;
13289
13290  DstReg = MI->getOperand(CurOp++).getReg();
13291  MemOpndSlot = CurOp;
13292  CurOp += X86::AddrNumOperands;
13293  SrcReg = MI->getOperand(CurOp++).getReg();
13294
13295  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
13296  MVT::SimpleValueType VT = *RC->vt_begin();
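  // t1 = value initially loaded from memory, t2 = result of the arithmetic
  // operation, t3 = value observed in PhyReg (EAX) after CMPXCHG, and t4 =
  // PHI of t1 (from thisMBB) and t3 (from the loop in mainMBB).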
13297  unsigned t1 = MRI.createVirtualRegister(RC);
13298  unsigned t2 = MRI.createVirtualRegister(RC);
13299  unsigned t3 = MRI.createVirtualRegister(RC);
13300  unsigned t4 = MRI.createVirtualRegister(RC);
13301  unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
13302
13303  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
13304  unsigned LOADOpc = getLoadOpcode(VT);
13305
13306  // For the atomic load-arith operator, we generate
13307  //
13308  //  thisMBB:
13309  //    t1 = LOAD [MI.addr]
13310  //  mainMBB:
13311  //    t4 = phi(t1 / thisMBB, t3 / mainMBB)
13312  //    t2 = OP MI.val, t4
13313  //    EAX = t4
13314  //    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
13315  //    t3 = EAX
13316  //    JNE mainMBB
13317  //  sinkMBB:
13318  //    dst = t3
13319
13320  MachineBasicBlock *thisMBB = MBB;
13321  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
13322  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
13323  MF->insert(I, mainMBB);
13324  MF->insert(I, sinkMBB);
13325
13326  MachineInstrBuilder MIB;
13327
13328  // Transfer the remainder of BB and its successor edges to sinkMBB.
13329  sinkMBB->splice(sinkMBB->begin(), MBB,
13330                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
13331  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
13332
13333  // thisMBB:
13334  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
13335  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
13336    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
13337    if (NewMO.isReg())
13338      NewMO.setIsKill(false);
13339    MIB.addOperand(NewMO);
13340  }
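  // Reuse the pseudo's memory operands for this initial load, but clear the
  // MOStore flag and set MOLoad since the instruction only reads memory.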
13341  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
13342    unsigned flags = (*MMOI)->getFlags();
13343    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
13344    MachineMemOperand *MMO =
13345      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
13346                               (*MMOI)->getSize(),
13347                               (*MMOI)->getBaseAlignment(),
13348                               (*MMOI)->getTBAAInfo(),
13349                               (*MMOI)->getRanges());
13350    MIB.addMemOperand(MMO);
13351  }
13352
13353  thisMBB->addSuccessor(mainMBB);
13354
13355  // mainMBB:
13356  MachineBasicBlock *origMainMBB = mainMBB;
13357
13358  // Add a PHI.
13359  MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
13360                        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
13361
13362  unsigned Opc = MI->getOpcode();
13363  switch (Opc) {
13364  default:
13365    llvm_unreachable("Unhandled atomic-load-op opcode!");
13366  case X86::ATOMAND8:
13367  case X86::ATOMAND16:
13368  case X86::ATOMAND32:
13369  case X86::ATOMAND64:
13370  case X86::ATOMOR8:
13371  case X86::ATOMOR16:
13372  case X86::ATOMOR32:
13373  case X86::ATOMOR64:
13374  case X86::ATOMXOR8:
13375  case X86::ATOMXOR16:
13376  case X86::ATOMXOR32:
13377  case X86::ATOMXOR64: {
13378    unsigned ARITHOpc = getNonAtomicOpcode(Opc);
13379    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
13380      .addReg(t4);
13381    break;
13382  }
13383  case X86::ATOMNAND8:
13384  case X86::ATOMNAND16:
13385  case X86::ATOMNAND32:
13386  case X86::ATOMNAND64: {
13387    unsigned Tmp = MRI.createVirtualRegister(RC);
13388    unsigned NOTOpc;
13389    unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
13390    BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
13391      .addReg(t4);
13392    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
13393    break;
13394  }
13395  case X86::ATOMMAX8:
13396  case X86::ATOMMAX16:
13397  case X86::ATOMMAX32:
13398  case X86::ATOMMAX64:
13399  case X86::ATOMMIN8:
13400  case X86::ATOMMIN16:
13401  case X86::ATOMMIN32:
13402  case X86::ATOMMIN64:
13403  case X86::ATOMUMAX8:
13404  case X86::ATOMUMAX16:
13405  case X86::ATOMUMAX32:
13406  case X86::ATOMUMAX64:
13407  case X86::ATOMUMIN8:
13408  case X86::ATOMUMIN16:
13409  case X86::ATOMUMIN32:
13410  case X86::ATOMUMIN64: {
13411    unsigned CMPOpc;
13412    unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
13413
13414    BuildMI(mainMBB, DL, TII->get(CMPOpc))
13415      .addReg(SrcReg)
13416      .addReg(t4);
13417
13418    if (Subtarget->hasCMov()) {
13419      if (VT != MVT::i8) {
13420        // Native support
13421        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
13422          .addReg(SrcReg)
13423          .addReg(t4);
13424      } else {
13425        // Promote i8 to i32 to use CMOV32
13426        const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
13427        const TargetRegisterClass *RC32 =
13428          TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
13429        unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
13430        unsigned AccReg32 = MRI.createVirtualRegister(RC32);
13431        unsigned Tmp = MRI.createVirtualRegister(RC32);
13432
13433        unsigned Undef = MRI.createVirtualRegister(RC32);
13434        BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
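        // Widen the 8-bit values by inserting them into undef 32-bit
        // registers; the CMOV condition was already set by the CMP8rr above,
        // so the undefined upper bits do not affect the selected low byte.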
13435
13436        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
13437          .addReg(Undef)
13438          .addReg(SrcReg)
13439          .addImm(X86::sub_8bit);
13440        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
13441          .addReg(Undef)
13442          .addReg(t4)
13443          .addImm(X86::sub_8bit);
13444
13445        BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
13446          .addReg(SrcReg32)
13447          .addReg(AccReg32);
13448
13449        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
13450          .addReg(Tmp, 0, X86::sub_8bit);
13451      }
13452    } else {
13453      // No native CMOV; use a pseudo CMOV and lower it below.
13454      assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
13455             "Invalid atomic-load-op transformation!");
13456      unsigned SelOpc = getPseudoCMOVOpc(VT);
13457      X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
13458      assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
13459      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
13460              .addReg(SrcReg).addReg(t4)
13461              .addImm(CC);
13462      mainMBB = EmitLoweredSelect(MIB, mainMBB);
13463      // Replace the original PHI node as mainMBB is changed after CMOV
13464      // lowering.
13465      BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
13466        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
13467      Phi->eraseFromParent();
13468    }
13469    break;
13470  }
13471  }
13472
13473  // Load PhyReg (EAX sized for VT) with the current value t4 for CMPXCHG.
13474  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
13475    .addReg(t4);
13476
13477  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
13478  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
13479    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
13480    if (NewMO.isReg())
13481      NewMO.setIsKill(false);
13482    MIB.addOperand(NewMO);
13483  }
13484  MIB.addReg(t2);
13485  MIB.setMemRefs(MMOBegin, MMOEnd);
13486
13487  // Copy the value CMPXCHG left in PhyReg back into virtual register t3.
13488  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
13489    .addReg(PhyReg);
13490
13491  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
13492
13493  mainMBB->addSuccessor(origMainMBB);
13494  mainMBB->addSuccessor(sinkMBB);
13495
13496  // sinkMBB:
13497  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
13498          TII->get(TargetOpcode::COPY), DstReg)
13499    .addReg(t3);
13500
13501  MI->eraseFromParent();
13502  return sinkMBB;
13503}
13504
13505// EmitAtomicLoadArith6432 - Emit the code sequence for a 64-bit pseudo
13506// atomic instruction on a 32-bit target. It is translated into a
13507// compare-exchange (spin) loop, from
13508//
13509//    ...
13510//    dst = atomic-fetch-op MI.addr, MI.val
13511//    ...
13512//
13513// to
13514//
13515//    ...
13516//    t1L = LOAD [MI.addr + 0]
13517//    t1H = LOAD [MI.addr + 4]
13518// loop:
13519//    t4L = phi(t1L, t3L / loop)
13520//    t4H = phi(t1H, t3H / loop)
13521//    t2L = OP MI.val.lo, t4L
13522//    t2H = OP MI.val.hi, t4H
13523//    EAX = t4L
13524//    EDX = t4H
13525//    EBX = t2L
13526//    ECX = t2H
13527//    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
13528//    t3L = EAX
13529//    t3H = EDX
13530//    JNE loop
13531// sink:
13532//    dstL = t3L
13533//    dstH = t3H
13534//    ...
13535MachineBasicBlock *
13536X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
13537                                           MachineBasicBlock *MBB) const {
13538  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13539  DebugLoc DL = MI->getDebugLoc();
13540
13541  MachineFunction *MF = MBB->getParent();
13542  MachineRegisterInfo &MRI = MF->getRegInfo();
13543
13544  const BasicBlock *BB = MBB->getBasicBlock();
13545  MachineFunction::iterator I = MBB;
13546  ++I;
13547
13548  assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 &&
13549         "Unexpected number of operands");
13550
13551  assert(MI->hasOneMemOperand() &&
13552         "Expected atomic-load-op32 to have one memoperand");
13553
13554  // Memory Reference
13555  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
13556  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
13557
13558  unsigned DstLoReg, DstHiReg;
13559  unsigned SrcLoReg, SrcHiReg;
13560  unsigned MemOpndSlot;
13561
13562  unsigned CurOp = 0;
13563
13564  DstLoReg = MI->getOperand(CurOp++).getReg();
13565  DstHiReg = MI->getOperand(CurOp++).getReg();
13566  MemOpndSlot = CurOp;
13567  CurOp += X86::AddrNumOperands;
13568  SrcLoReg = MI->getOperand(CurOp++).getReg();
13569  SrcHiReg = MI->getOperand(CurOp++).getReg();
13570
13571  const TargetRegisterClass *RC = &X86::GR32RegClass;
13572  const TargetRegisterClass *RC8 = &X86::GR8RegClass;
13573
13574  unsigned t1L = MRI.createVirtualRegister(RC);
13575  unsigned t1H = MRI.createVirtualRegister(RC);
13576  unsigned t2L = MRI.createVirtualRegister(RC);
13577  unsigned t2H = MRI.createVirtualRegister(RC);
13578  unsigned t3L = MRI.createVirtualRegister(RC);
13579  unsigned t3H = MRI.createVirtualRegister(RC);
13580  unsigned t4L = MRI.createVirtualRegister(RC);
13581  unsigned t4H = MRI.createVirtualRegister(RC);
13582
13583  unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
13584  unsigned LOADOpc = X86::MOV32rm;
13585
13586  // For the atomic load-arith operator, we generate
13587  //
13588  //  thisMBB:
13589  //    t1L = LOAD [MI.addr + 0]
13590  //    t1H = LOAD [MI.addr + 4]
13591  //  mainMBB:
13592  //    t4L = phi(t1L / thisMBB, t3L / mainMBB)
13593  //    t4H = phi(t1H / thisMBB, t3H / mainMBB)
13594  //    t2L = OP MI.val.lo, t4L
13595  //    t2H = OP MI.val.hi, t4H
13596  //    EAX = t4L
13596  //    EDX = t4H
13596  //    EBX = t2L
13597  //    ECX = t2H
13598  //    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
13599  //    t3L = EAX
13600  //    t3H = EDX
13601  //    JNE loop
13602  //  sinkMBB:
13603  //    dstL = t3L
13604  //    dstH = t3H
13605
13606  MachineBasicBlock *thisMBB = MBB;
13607  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
13608  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
13609  MF->insert(I, mainMBB);
13610  MF->insert(I, sinkMBB);
13611
13612  MachineInstrBuilder MIB;
13613
13614  // Transfer the remainder of BB and its successor edges to sinkMBB.
13615  sinkMBB->splice(sinkMBB->begin(), MBB,
13616                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
13617  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
13618
13619  // thisMBB:
13620  // Lo
13621  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
13622  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
13623    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
13624    if (NewMO.isReg())
13625      NewMO.setIsKill(false);
13626    MIB.addOperand(NewMO);
13627  }
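  // As in EmitAtomicLoadArith, reuse the pseudo's memory operands but mark
  // them as loads for this initial read of the low half.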
13628  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
13629    unsigned flags = (*MMOI)->getFlags();
13630    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
13631    MachineMemOperand *MMO =
13632      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
13633                               (*MMOI)->getSize(),
13634                               (*MMOI)->getBaseAlignment(),
13635                               (*MMOI)->getTBAAInfo(),
13636                               (*MMOI)->getRanges());
13637    MIB.addMemOperand(MMO);
13638  }
13639  MachineInstr *LowMI = MIB;
13640
13641  // Hi
13642  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
13643  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
13644    if (i == X86::AddrDisp) {
13645      MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
13646    } else {
13647      MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
13648      if (NewMO.isReg())
13649        NewMO.setIsKill(false);
13650      MIB.addOperand(NewMO);
13651    }
13652  }
13653  MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
13654
13655  thisMBB->addSuccessor(mainMBB);
13656
13657  // mainMBB:
13658  MachineBasicBlock *origMainMBB = mainMBB;
13659
13660  // Add PHIs.
13661  MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
13662                        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
13663  MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
13664                        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
13665
13666  unsigned Opc = MI->getOpcode();
13667  switch (Opc) {
13668  default:
13669    llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
13670  case X86::ATOMAND6432:
13671  case X86::ATOMOR6432:
13672  case X86::ATOMXOR6432:
13673  case X86::ATOMADD6432:
13674  case X86::ATOMSUB6432: {
13675    unsigned HiOpc;
13676    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
13677    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
13678      .addReg(SrcLoReg);
13679    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
13680      .addReg(SrcHiReg);
13681    break;
13682  }
13683  case X86::ATOMNAND6432: {
13684    unsigned HiOpc, NOTOpc;
13685    unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
13686    unsigned TmpL = MRI.createVirtualRegister(RC);
13687    unsigned TmpH = MRI.createVirtualRegister(RC);
13688    BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
13689      .addReg(t4L);
13690    BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
13691      .addReg(t4H);
13692    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
13693    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
13694    break;
13695  }
13696  case X86::ATOMMAX6432:
13697  case X86::ATOMMIN6432:
13698  case X86::ATOMUMAX6432:
13699  case X86::ATOMUMIN6432: {
13700    unsigned HiOpc;
13701    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
13702    unsigned cL = MRI.createVirtualRegister(RC8);
13703    unsigned cH = MRI.createVirtualRegister(RC8);
13704    unsigned cL32 = MRI.createVirtualRegister(RC);
13705    unsigned cH32 = MRI.createVirtualRegister(RC);
13706    unsigned cc = MRI.createVirtualRegister(RC);
13707    // cl := cmp src_lo, lo
13708    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
13709      .addReg(SrcLoReg).addReg(t4L);
13710    BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
13711    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
13712    // ch := cmp src_hi, hi
13713    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
13714      .addReg(SrcHiReg).addReg(t4H);
13715    BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
13716    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
13717    // cc := (src_hi == hi) ? cl : ch
13718    if (Subtarget->hasCMov()) {
13719      BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
13720        .addReg(cH32).addReg(cL32);
13721    } else {
13722      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
13723              .addReg(cH32).addReg(cL32)
13724              .addImm(X86::COND_E);
13725      mainMBB = EmitLoweredSelect(MIB, mainMBB);
13726    }
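    // Select the new value: t2L/t2H start as the source halves and are
    // replaced by the current halves (t4L/t4H) when cc is non-zero.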
13727    BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
13728    if (Subtarget->hasCMov()) {
13729      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
13730        .addReg(SrcLoReg).addReg(t4L);
13731      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
13732        .addReg(SrcHiReg).addReg(t4H);
13733    } else {
13734      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
13735              .addReg(SrcLoReg).addReg(t4L)
13736              .addImm(X86::COND_NE);
13737      mainMBB = EmitLoweredSelect(MIB, mainMBB);
13738      // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
13739      // 2nd CMOV lowering.
13740      mainMBB->addLiveIn(X86::EFLAGS);
13741      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
13742              .addReg(SrcHiReg).addReg(t4H)
13743              .addImm(X86::COND_NE);
13744      mainMBB = EmitLoweredSelect(MIB, mainMBB);
13745      // Replace the original PHI node as mainMBB is changed after CMOV
13746      // lowering.
13747      BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
13748        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
13749      BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
13750        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
13751      PhiL->eraseFromParent();
13752      PhiH->eraseFromParent();
13753    }
13754    break;
13755  }
13756  case X86::ATOMSWAP6432: {
13757    unsigned HiOpc;
13758    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
13759    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
13760    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
13761    break;
13762  }
13763  }
13764
13765  // Load EDX:EAX with the current value t4H:t4L for CMPXCHG8B.
13766  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
13767  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
13768  // Load ECX:EBX with the new value t2H:t2L.
13769  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
13770  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
13771
13772  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
13773  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
13774    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
13775    if (NewMO.isReg())
13776      NewMO.setIsKill(false);
13777    MIB.addOperand(NewMO);
13778  }
13779  MIB.setMemRefs(MMOBegin, MMOEnd);
13780
13781  // Copy EDX:EAX back to t3H:t3L
13782  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
13783  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
13784
13785  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
13786
13787  mainMBB->addSuccessor(origMainMBB);
13788  mainMBB->addSuccessor(sinkMBB);
13789
13790  // sinkMBB:
13791  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
13792          TII->get(TargetOpcode::COPY), DstLoReg)
13793    .addReg(t3L);
13794  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
13795          TII->get(TargetOpcode::COPY), DstHiReg)
13796    .addReg(t3H);
13797
13798  MI->eraseFromParent();
13799  return sinkMBB;
13800}
13801
13802// FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8 or
13803// XMM0_V32I8 in AVX, all of this code can be replaced with patterns in
13804// the .td file.
13805static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
13806                                       const TargetInstrInfo *TII) {
13807  unsigned Opc;
13808  switch (MI->getOpcode()) {
13809  default: llvm_unreachable("illegal opcode!");
13810  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
13811  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
13812  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
13813  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
13814  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
13815  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
13816  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
13817  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
13818  }
13819
13820  DebugLoc dl = MI->getDebugLoc();
13821  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
13822
13823  unsigned NumArgs = MI->getNumOperands();
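  // Copy the explicit operands, skipping operand 0 (the result) and any
  // implicit register operands; the result itself is read from XMM0 below.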
13824  for (unsigned i = 1; i < NumArgs; ++i) {
13825    MachineOperand &Op = MI->getOperand(i);
13826    if (!(Op.isReg() && Op.isImplicit()))
13827      MIB.addOperand(Op);
13828  }
13829  if (MI->hasOneMemOperand())
13830    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
13831
13832  BuildMI(*BB, MI, dl,
13833    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
13834    .addReg(X86::XMM0);
13835
13836  MI->eraseFromParent();
13837  return BB;
13838}
13839
13840// FIXME: Custom handling because TableGen doesn't support multiple implicit
13841// defs in an instruction pattern
13842static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
13843                                       const TargetInstrInfo *TII) {
13844  unsigned Opc;
13845  switch (MI->getOpcode()) {
13846  default: llvm_unreachable("illegal opcode!");
13847  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
13848  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
13849  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
13850  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
13851  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
13852  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
13853  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
13854  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
13855  }
13856
13857  DebugLoc dl = MI->getDebugLoc();
13858  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
13859
13860  unsigned NumArgs = MI->getNumOperands(); // operand 0 is the result; skip it
13861  for (unsigned i = 1; i < NumArgs; ++i) {
13862    MachineOperand &Op = MI->getOperand(i);
13863    if (!(Op.isReg() && Op.isImplicit()))
13864      MIB.addOperand(Op);
13865  }
13866  if (MI->hasOneMemOperand())
13867    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
13868
13869  BuildMI(*BB, MI, dl,
13870    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
13871    .addReg(X86::ECX);
13872
13873  MI->eraseFromParent();
13874  return BB;
13875}
13876
13877static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
13878                                       const TargetInstrInfo *TII,
13879                                       const X86Subtarget* Subtarget) {
13880  DebugLoc dl = MI->getDebugLoc();
13881
13882  // Address into RAX/EAX, other two args into ECX, EDX.
13883  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
13884  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13885  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
13886  for (int i = 0; i < X86::AddrNumOperands; ++i)
13887    MIB.addOperand(MI->getOperand(i));
13888
13889  unsigned ValOps = X86::AddrNumOperands;
13890  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
13891    .addReg(MI->getOperand(ValOps).getReg());
13892  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
13893    .addReg(MI->getOperand(ValOps+1).getReg());
13894
13895  // MONITOR takes no explicit operands; it implicitly reads the regs set above.
13896  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
13897
13898  MI->eraseFromParent(); // The pseudo is gone now.
13899  return BB;
13900}
13901
13902MachineBasicBlock *
13903X86TargetLowering::EmitVAARG64WithCustomInserter(
13904                   MachineInstr *MI,
13905                   MachineBasicBlock *MBB) const {
13906  // Emit va_arg instruction on X86-64.
13907
13908  // Operands to this pseudo-instruction:
13909  // 0  ) Output        : destination address (reg)
13910  // 1-5) Input         : va_list address (addr, i64mem)
13911  // 6  ) ArgSize       : Size (in bytes) of vararg type
13912  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
13913  // 8  ) Align         : Alignment of type
13914  // 9  ) EFLAGS (implicit-def)
13915
13916  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
13917  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
13918
13919  unsigned DestReg = MI->getOperand(0).getReg();
13920  MachineOperand &Base = MI->getOperand(1);
13921  MachineOperand &Scale = MI->getOperand(2);
13922  MachineOperand &Index = MI->getOperand(3);
13923  MachineOperand &Disp = MI->getOperand(4);
13924  MachineOperand &Segment = MI->getOperand(5);
13925  unsigned ArgSize = MI->getOperand(6).getImm();
13926  unsigned ArgMode = MI->getOperand(7).getImm();
13927  unsigned Align = MI->getOperand(8).getImm();
13928
13929  // Memory Reference
13930  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
13931  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
13932  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
13933
13934  // Machine Information
13935  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13936  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
13937  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
13938  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
13939  DebugLoc DL = MI->getDebugLoc();
13940
13941  // struct va_list {
13942  //   i32   gp_offset
13943  //   i32   fp_offset
13944  //   i64   overflow_area (address)
13945  //   i64   reg_save_area (address)
13946  // }
13947  // sizeof(va_list) = 24
13948  // alignment(va_list) = 8
13949
13950  unsigned TotalNumIntRegs = 6;
13951  unsigned TotalNumXMMRegs = 8;
13952  bool UseGPOffset = (ArgMode == 1);
13953  bool UseFPOffset = (ArgMode == 2);
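  // The reg_save_area holds the 6 integer argument registers (8 bytes each)
  // followed by the 8 XMM argument registers (16 bytes each); MaxOffset is
  // the end of the region this argument class may consume.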
13954  unsigned MaxOffset = TotalNumIntRegs * 8 +
13955                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
13956
13957  // Align ArgSize to a multiple of 8.
13958  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
13959  bool NeedsAlign = (Align > 8);
13960
13961  MachineBasicBlock *thisMBB = MBB;
13962  MachineBasicBlock *overflowMBB;
13963  MachineBasicBlock *offsetMBB;
13964  MachineBasicBlock *endMBB;
13965
13966  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
13967  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
13968  unsigned OffsetReg = 0;
13969
13970  if (!UseGPOffset && !UseFPOffset) {
13971    // If we only pull from the overflow region, we don't need to alter
13972    // control flow, so no branch is created.
13973    OffsetDestReg = 0; // unused
13974    OverflowDestReg = DestReg;
13975
13976    offsetMBB = NULL;
13977    overflowMBB = thisMBB;
13978    endMBB = thisMBB;
13979  } else {
13980    // First emit code to check if gp_offset (or fp_offset) is below the bound.
13981    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
13982    // If not, pull from overflow_area. (branch to overflowMBB)
13983    //
13984    //       thisMBB
13985    //         |     .
13986    //         |        .
13987    //     offsetMBB   overflowMBB
13988    //         |        .
13989    //         |     .
13990    //        endMBB
13991
13992    // Registers for the PHI in endMBB
13993    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
13994    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
13995
13996    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
13997    MachineFunction *MF = MBB->getParent();
13998    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13999    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
14000    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
14001
14002    MachineFunction::iterator MBBIter = MBB;
14003    ++MBBIter;
14004
14005    // Insert the new basic blocks
14006    MF->insert(MBBIter, offsetMBB);
14007    MF->insert(MBBIter, overflowMBB);
14008    MF->insert(MBBIter, endMBB);
14009
14010    // Transfer the remainder of MBB and its successor edges to endMBB.
14011    endMBB->splice(endMBB->begin(), thisMBB,
14012                    llvm::next(MachineBasicBlock::iterator(MI)),
14013                    thisMBB->end());
14014    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
14015
14016    // Make offsetMBB and overflowMBB successors of thisMBB
14017    thisMBB->addSuccessor(offsetMBB);
14018    thisMBB->addSuccessor(overflowMBB);
14019
14020    // endMBB is a successor of both offsetMBB and overflowMBB
14021    offsetMBB->addSuccessor(endMBB);
14022    overflowMBB->addSuccessor(endMBB);
14023
14024    // Load the offset value into a register
14025    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
14026    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
14027      .addOperand(Base)
14028      .addOperand(Scale)
14029      .addOperand(Index)
14030      .addDisp(Disp, UseFPOffset ? 4 : 0)
14031      .addOperand(Segment)
14032      .setMemRefs(MMOBegin, MMOEnd);
14033
14034    // Check if there is enough room left to pull this argument.
14035    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
14036      .addReg(OffsetReg)
14037      .addImm(MaxOffset + 8 - ArgSizeA8);
14038
14039    // Branch to "overflowMBB" if offset >= max
14040    // Fall through to "offsetMBB" otherwise
14041    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
14042      .addMBB(overflowMBB);
14043  }
14044
14045  // In offsetMBB, emit code to use the reg_save_area.
14046  if (offsetMBB) {
14047    assert(OffsetReg != 0);
14048
14049    // Read the reg_save_area address.
14050    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
14051    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
14052      .addOperand(Base)
14053      .addOperand(Scale)
14054      .addOperand(Index)
14055      .addDisp(Disp, 16)
14056      .addOperand(Segment)
14057      .setMemRefs(MMOBegin, MMOEnd);
14058
14059    // Zero-extend the offset
14060    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
14061    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
14062      .addImm(0)
14063      .addReg(OffsetReg)
14064      .addImm(X86::sub_32bit);
14065
14066    // Add the offset to the reg_save_area to get the final address.
14067    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
14068      .addReg(OffsetReg64)
14069      .addReg(RegSaveReg);
14070
14071    // Compute the offset for the next argument
14072    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
14073    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
14074      .addReg(OffsetReg)
14075      .addImm(UseFPOffset ? 16 : 8);
14076
14077    // Store it back into the va_list.
14078    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
14079      .addOperand(Base)
14080      .addOperand(Scale)
14081      .addOperand(Index)
14082      .addDisp(Disp, UseFPOffset ? 4 : 0)
14083      .addOperand(Segment)
14084      .addReg(NextOffsetReg)
14085      .setMemRefs(MMOBegin, MMOEnd);
14086
14087    // Jump to endMBB
14088    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
14089      .addMBB(endMBB);
14090  }
14091
14092  //
14093  // Emit code to use overflow area
14094  //
14095
14096  // Load the overflow_area address into a register.
14097  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
14098  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
14099    .addOperand(Base)
14100    .addOperand(Scale)
14101    .addOperand(Index)
14102    .addDisp(Disp, 8)
14103    .addOperand(Segment)
14104    .setMemRefs(MMOBegin, MMOEnd);
14105
14106  // If we need to align it, do so. Otherwise, just copy the address
14107  // to OverflowDestReg.
14108  if (NeedsAlign) {
14109    // Align the overflow address
14110    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
14111    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
14112
14113    // aligned_addr = (addr + (align-1)) & ~(align-1)
14114    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
14115      .addReg(OverflowAddrReg)
14116      .addImm(Align-1);
14117
14118    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
14119      .addReg(TmpReg)
14120      .addImm(~(uint64_t)(Align-1));
14121  } else {
14122    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
14123      .addReg(OverflowAddrReg);
14124  }
14125
14126  // Compute the next overflow address after this argument.
14127  // (the overflow address should be kept 8-byte aligned)
14128  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
14129  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
14130    .addReg(OverflowDestReg)
14131    .addImm(ArgSizeA8);
14132
14133  // Store the new overflow address.
14134  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
14135    .addOperand(Base)
14136    .addOperand(Scale)
14137    .addOperand(Index)
14138    .addDisp(Disp, 8)
14139    .addOperand(Segment)
14140    .addReg(NextAddrReg)
14141    .setMemRefs(MMOBegin, MMOEnd);
14142
14143  // If we branched, emit the PHI to the front of endMBB.
14144  if (offsetMBB) {
14145    BuildMI(*endMBB, endMBB->begin(), DL,
14146            TII->get(X86::PHI), DestReg)
14147      .addReg(OffsetDestReg).addMBB(offsetMBB)
14148      .addReg(OverflowDestReg).addMBB(overflowMBB);
14149  }
14150
14151  // Erase the pseudo instruction
14152  MI->eraseFromParent();
14153
14154  return endMBB;
14155}
14156
14157MachineBasicBlock *
14158X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
14159                                                 MachineInstr *MI,
14160                                                 MachineBasicBlock *MBB) const {
14161  // Emit code to save XMM registers to the stack. The ABI says that the
14162  // number of registers to save is given in %al, so it's theoretically
14163  // possible to do an indirect jump trick to avoid saving all of them;
14164  // however, this code takes a simpler approach and just executes all
14165  // of the stores if %al is non-zero. It's less code, and it's probably
14166  // easier on the hardware branch predictor, and stores aren't all that
14167  // expensive anyway.
14168
14169  // Create the new basic blocks. One block contains all the XMM stores,
14170  // and one block is the final destination regardless of whether any
14171  // stores were performed.
14172  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
14173  MachineFunction *F = MBB->getParent();
14174  MachineFunction::iterator MBBIter = MBB;
14175  ++MBBIter;
14176  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
14177  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
14178  F->insert(MBBIter, XMMSaveMBB);
14179  F->insert(MBBIter, EndMBB);
14180
14181  // Transfer the remainder of MBB and its successor edges to EndMBB.
14182  EndMBB->splice(EndMBB->begin(), MBB,
14183                 llvm::next(MachineBasicBlock::iterator(MI)),
14184                 MBB->end());
14185  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
14186
14187  // The original block will now fall through to the XMM save block.
14188  MBB->addSuccessor(XMMSaveMBB);
14189  // The XMMSaveMBB will fall through to the end block.
14190  XMMSaveMBB->addSuccessor(EndMBB);
14191
14192  // Now add the instructions.
14193  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14194  DebugLoc DL = MI->getDebugLoc();
14195
14196  unsigned CountReg = MI->getOperand(0).getReg();
14197  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
14198  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
14199
14200  if (!Subtarget->isTargetWin64()) {
14201    // If %al is 0, branch around the XMM save block.
14202    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
14203    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
14204    MBB->addSuccessor(EndMBB);
14205  }
14206
14207  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
14208  // In the XMM save block, save all the XMM argument registers.
14209  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
14210    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
14211    MachineMemOperand *MMO =
14212      F->getMachineMemOperand(
14213          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
14214        MachineMemOperand::MOStore,
14215        /*Size=*/16, /*Align=*/16);
14216    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
14217      .addFrameIndex(RegSaveFrameIndex)
14218      .addImm(/*Scale=*/1)
14219      .addReg(/*IndexReg=*/0)
14220      .addImm(/*Disp=*/Offset)
14221      .addReg(/*Segment=*/0)
14222      .addReg(MI->getOperand(i).getReg())
14223      .addMemOperand(MMO);
14224  }
14225
14226  MI->eraseFromParent();   // The pseudo instruction is gone now.
14227
14228  return EndMBB;
14229}
14230
14231// The EFLAGS operand of SelectItr might be missing a kill marker
14232// because there were multiple uses of EFLAGS, and ISel didn't know
14233// which to mark. Figure out whether SelectItr should have had a
14234// kill marker, and set it if it should. Returns the correct kill
14235// marker value.
14236static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
14237                                     MachineBasicBlock* BB,
14238                                     const TargetRegisterInfo* TRI) {
14239  // Scan forward through BB for a use/def of EFLAGS.
14240  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
14241  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
14242    const MachineInstr& mi = *miI;
14243    if (mi.readsRegister(X86::EFLAGS))
14244      return false;
14245    if (mi.definesRegister(X86::EFLAGS))
14246      break; // Should have kill-flag - update below.
14247  }
14248
14249  // If we hit the end of the block, check whether EFLAGS is live into a
14250  // successor.
14251  if (miI == BB->end()) {
14252    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
14253                                          sEnd = BB->succ_end();
14254         sItr != sEnd; ++sItr) {
14255      MachineBasicBlock* succ = *sItr;
14256      if (succ->isLiveIn(X86::EFLAGS))
14257        return false;
14258    }
14259  }
14260
14261  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
14262  // out. SelectMI should have a kill flag on EFLAGS.
14263  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
14264  return true;
14265}
14266
14267MachineBasicBlock *
14268X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
14269                                     MachineBasicBlock *BB) const {
14270  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14271  DebugLoc DL = MI->getDebugLoc();
14272
14273  // To "insert" a SELECT_CC instruction, we actually have to insert the
14274  // diamond control-flow pattern.  The incoming instruction knows the
14275  // destination vreg to set, the condition code register to branch on, the
14276  // true/false values to select between, and a branch opcode to use.
14277  const BasicBlock *LLVM_BB = BB->getBasicBlock();
14278  MachineFunction::iterator It = BB;
14279  ++It;
14280
14281  //  thisMBB:
14282  //  ...
14283  //   TrueVal = ...
14284  //   cmpTY ccX, r1, r2
14285  //   bCC copy1MBB
14286  //   fallthrough --> copy0MBB
14287  MachineBasicBlock *thisMBB = BB;
14288  MachineFunction *F = BB->getParent();
14289  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
14290  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
14291  F->insert(It, copy0MBB);
14292  F->insert(It, sinkMBB);
14293
14294  // If the EFLAGS register isn't dead in the terminator, then claim that it's
14295  // live into the sink and copy blocks.
14296  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
14297  if (!MI->killsRegister(X86::EFLAGS) &&
14298      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
14299    copy0MBB->addLiveIn(X86::EFLAGS);
14300    sinkMBB->addLiveIn(X86::EFLAGS);
14301  }
14302
14303  // Transfer the remainder of BB and its successor edges to sinkMBB.
14304  sinkMBB->splice(sinkMBB->begin(), BB,
14305                  llvm::next(MachineBasicBlock::iterator(MI)),
14306                  BB->end());
14307  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
14308
14309  // Add the true and fallthrough blocks as its successors.
14310  BB->addSuccessor(copy0MBB);
14311  BB->addSuccessor(sinkMBB);
14312
14313  // Create the conditional branch instruction.
14314  unsigned Opc =
14315    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
14316  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
14317
14318  //  copy0MBB:
14319  //   %FalseValue = ...
14320  //   # fallthrough to sinkMBB
14321  copy0MBB->addSuccessor(sinkMBB);
14322
14323  //  sinkMBB:
14324  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
14325  //  ...
14326  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
14327          TII->get(X86::PHI), MI->getOperand(0).getReg())
14328    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
14329    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
14330
14331  MI->eraseFromParent();   // The pseudo instruction is gone now.
14332  return sinkMBB;
14333}
14334
14335MachineBasicBlock *
14336X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
14337                                        bool Is64Bit) const {
14338  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14339  DebugLoc DL = MI->getDebugLoc();
14340  MachineFunction *MF = BB->getParent();
14341  const BasicBlock *LLVM_BB = BB->getBasicBlock();
14342
14343  assert(getTargetMachine().Options.EnableSegmentedStacks);
14344
14345  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
14346  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
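  // TlsReg:TlsOffset addresses the thread-local stack-limit slot maintained
  // by the segmented-stack runtime; the CMP below checks the prospective
  // stack pointer against it.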
14347
14348  // BB:
14349  //  ... [Till the alloca]
14350  // If stacklet is not large enough, jump to mallocMBB
14351  //
14352  // bumpMBB:
14353  //  Allocate by subtracting from RSP
14354  //  Jump to continueMBB
14355  //
14356  // mallocMBB:
14357  //  Allocate by call to runtime
14358  //
14359  // continueMBB:
14360  //  ...
14361  //  [rest of original BB]
14362  //
14363
14364  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
14365  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
14366  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
14367
14368  MachineRegisterInfo &MRI = MF->getRegInfo();
14369  const TargetRegisterClass *AddrRegClass =
14370    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
14371
14372  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
14373    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
14374    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
14375    SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
14376    sizeVReg = MI->getOperand(1).getReg(),
14377    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
14378
14379  MachineFunction::iterator MBBIter = BB;
14380  ++MBBIter;
14381
14382  MF->insert(MBBIter, bumpMBB);
14383  MF->insert(MBBIter, mallocMBB);
14384  MF->insert(MBBIter, continueMBB);
14385
14386  continueMBB->splice(continueMBB->begin(), BB, llvm::next
14387                      (MachineBasicBlock::iterator(MI)), BB->end());
14388  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
14389
14390  // Add code to the main basic block to check if the stack limit has been hit,
14391  // and if so, jump to mallocMBB otherwise to bumpMBB.
14392  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
14393  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
14394    .addReg(tmpSPVReg).addReg(sizeVReg);
14395  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
14396    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
14397    .addReg(SPLimitVReg);
14398  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
14399
14400  // bumpMBB simply decreases the stack pointer, since we know the current
14401  // stacklet has enough space.
14402  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
14403    .addReg(SPLimitVReg);
14404  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
14405    .addReg(SPLimitVReg);
14406  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
14407
14408  // Calls into a routine in libgcc to allocate more space from the heap.
14409  const uint32_t *RegMask =
14410    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
14411  if (Is64Bit) {
14412    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
14413      .addReg(sizeVReg);
14414    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
14415      .addExternalSymbol("__morestack_allocate_stack_space")
14416      .addRegMask(RegMask)
14417      .addReg(X86::RDI, RegState::Implicit)
14418      .addReg(X86::RAX, RegState::ImplicitDefine);
14419  } else {
14420    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
14421      .addImm(12);
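    // Together with the 4-byte PUSH below this adjusts ESP by 16; the
    // ADD32ri after the call undoes both.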
14422    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
14423    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
14424      .addExternalSymbol("__morestack_allocate_stack_space")
14425      .addRegMask(RegMask)
14426      .addReg(X86::EAX, RegState::ImplicitDefine);
14427  }
14428
14429  if (!Is64Bit)
14430    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
14431      .addImm(16);
14432
14433  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
14434    .addReg(Is64Bit ? X86::RAX : X86::EAX);
14435  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
14436
14437  // Set up the CFG correctly.
14438  BB->addSuccessor(bumpMBB);
14439  BB->addSuccessor(mallocMBB);
14440  mallocMBB->addSuccessor(continueMBB);
14441  bumpMBB->addSuccessor(continueMBB);
14442
14443  // Take care of the PHI nodes.
14444  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
14445          MI->getOperand(0).getReg())
14446    .addReg(mallocPtrVReg).addMBB(mallocMBB)
14447    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
14448
14449  // Delete the original pseudo instruction.
14450  MI->eraseFromParent();
14451
14452  // And we're done.
14453  return continueMBB;
14454}
14455
14456MachineBasicBlock *
14457X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
14458                                          MachineBasicBlock *BB) const {
14459  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14460  DebugLoc DL = MI->getDebugLoc();
14461
14462  assert(!Subtarget->isTargetEnvMacho());
14463
14464  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
14465  // non-trivial part is the implicit def of ESP.
14466
14467  if (Subtarget->isTargetWin64()) {
14468    if (Subtarget->isTargetCygMing()) {
14469      // ___chkstk(Mingw64):
14470      // Clobbers R10, R11, RAX and EFLAGS.
14471      // Updates RSP.
14472      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
14473        .addExternalSymbol("___chkstk")
14474        .addReg(X86::RAX, RegState::Implicit)
14475        .addReg(X86::RSP, RegState::Implicit)
14476        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
14477        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
14478        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
14479    } else {
14480      // __chkstk(MSVCRT): does not update stack pointer.
14481      // Clobbers R10, R11 and EFLAGS.
14482      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
14483        .addExternalSymbol("__chkstk")
14484        .addReg(X86::RAX, RegState::Implicit)
14485        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
14486      // RAX has the offset to be subtracted from RSP.
14487      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
14488        .addReg(X86::RSP)
14489        .addReg(X86::RAX);
14490    }
14491  } else {
14492    const char *StackProbeSymbol =
14493      Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
14494
14495    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
14496      .addExternalSymbol(StackProbeSymbol)
14497      .addReg(X86::EAX, RegState::Implicit)
14498      .addReg(X86::ESP, RegState::Implicit)
14499      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
14500      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
14501      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
14502  }
14503
14504  MI->eraseFromParent();   // The pseudo instruction is gone now.
14505  return BB;
14506}
14507
14508MachineBasicBlock *
14509X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
14510                                      MachineBasicBlock *BB) const {
14511  // This is pretty easy.  We're taking the value that we received from
14512  // our load from the relocation, sticking it in either RDI (x86-64)
14513  // or EAX and doing an indirect call.  The return value will then
14514  // be in the normal return register.
14515  const X86InstrInfo *TII
14516    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
14517  DebugLoc DL = MI->getDebugLoc();
14518  MachineFunction *F = BB->getParent();
14519
14520  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
14521  assert(MI->getOperand(3).isGlobal() && "This should be a global");
14522
14523  // Get a register mask for the lowered call.
14524  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
14525  // proper register mask.
14526  const uint32_t *RegMask =
14527    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
14528  if (Subtarget->is64Bit()) {
14529    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
14530                                      TII->get(X86::MOV64rm), X86::RDI)
14531    .addReg(X86::RIP)
14532    .addImm(0).addReg(0)
14533    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
14534                      MI->getOperand(3).getTargetFlags())
14535    .addReg(0);
14536    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
14537    addDirectMem(MIB, X86::RDI);
14538    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
14539  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
14540    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
14541                                      TII->get(X86::MOV32rm), X86::EAX)
14542    .addReg(0)
14543    .addImm(0).addReg(0)
14544    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
14545                      MI->getOperand(3).getTargetFlags())
14546    .addReg(0);
14547    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
14548    addDirectMem(MIB, X86::EAX);
14549    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
14550  } else {
14551    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
14552                                      TII->get(X86::MOV32rm), X86::EAX)
14553    .addReg(TII->getGlobalBaseReg(F))
14554    .addImm(0).addReg(0)
14555    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
14556                      MI->getOperand(3).getTargetFlags())
14557    .addReg(0);
14558    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
14559    addDirectMem(MIB, X86::EAX);
14560    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
14561  }
14562
14563  MI->eraseFromParent(); // The pseudo instruction is gone now.
14564  return BB;
14565}
14566
14567MachineBasicBlock *
14568X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
14569                                    MachineBasicBlock *MBB) const {
14570  DebugLoc DL = MI->getDebugLoc();
14571  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14572
14573  MachineFunction *MF = MBB->getParent();
14574  MachineRegisterInfo &MRI = MF->getRegInfo();
14575
14576  const BasicBlock *BB = MBB->getBasicBlock();
14577  MachineFunction::iterator I = MBB;
14578  ++I;
14579
14580  // Memory Reference
14581  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
14582  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
14583
14584  unsigned DstReg;
14585  unsigned MemOpndSlot = 0;
14586
14587  unsigned CurOp = 0;
14588
14589  DstReg = MI->getOperand(CurOp++).getReg();
14590  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
14591  assert(RC->hasType(MVT::i32) && "Invalid destination!");
14592  unsigned mainDstReg = MRI.createVirtualRegister(RC);
14593  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
14594
14595  MemOpndSlot = CurOp;
14596
14597  MVT PVT = getPointerTy();
14598  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
14599         "Invalid Pointer Size!");
14600
14601  // For v = setjmp(buf), we generate
14602  //
14603  // thisMBB:
14604  //  buf[LabelOffset] = restoreMBB
14605  //  SjLjSetup restoreMBB
14606  //
14607  // mainMBB:
14608  //  v_main = 0
14609  //
14610  // sinkMBB:
14611  //  v = phi(main, restore)
14612  //
14613  // restoreMBB:
14614  //  v_restore = 1
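  // Buffer layout used by this lowering: this function stores the address of
  // restoreMBB at buf[LabelOffset]; emitEHSjLjLongJmp below reloads FP from
  // buf[0], the resume address from buf[LabelOffset], and SP from
  // buf[SPOffset] (offsets in units of the pointer store size).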
14615
14616  MachineBasicBlock *thisMBB = MBB;
14617  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
14618  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
14619  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
14620  MF->insert(I, mainMBB);
14621  MF->insert(I, sinkMBB);
14622  MF->push_back(restoreMBB);
14623
14624  MachineInstrBuilder MIB;
14625
14626  // Transfer the remainder of BB and its successor edges to sinkMBB.
14627  sinkMBB->splice(sinkMBB->begin(), MBB,
14628                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
14629  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
14630
14631  // thisMBB:
14632  unsigned PtrStoreOpc = 0;
14633  unsigned LabelReg = 0;
14634  const int64_t LabelOffset = 1 * PVT.getStoreSize();
14635  Reloc::Model RM = getTargetMachine().getRelocationModel();
14636  bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
14637                     (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
14638
14639  // Prepare IP either in reg or imm.
14640  if (!UseImmLabel) {
14641    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
14642    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
14643    LabelReg = MRI.createVirtualRegister(PtrRC);
14644    if (Subtarget->is64Bit()) {
14645      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
14646              .addReg(X86::RIP)
14647              .addImm(0)
14648              .addReg(0)
14649              .addMBB(restoreMBB)
14650              .addReg(0);
14651    } else {
14652      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
14653      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
14654              .addReg(XII->getGlobalBaseReg(MF))
14655              .addImm(0)
14656              .addReg(0)
14657              .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
14658              .addReg(0);
14659    }
14660  } else
14661    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
14662  // Store IP
14663  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
14664  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14665    if (i == X86::AddrDisp)
14666      MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
14667    else
14668      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
14669  }
14670  if (!UseImmLabel)
14671    MIB.addReg(LabelReg);
14672  else
14673    MIB.addMBB(restoreMBB);
14674  MIB.setMemRefs(MMOBegin, MMOEnd);
14675  // Setup
14676  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
14677          .addMBB(restoreMBB);
14678
14679  const X86RegisterInfo *RegInfo =
14680    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
14681  MIB.addRegMask(RegInfo->getNoPreservedMask());
14682  thisMBB->addSuccessor(mainMBB);
14683  thisMBB->addSuccessor(restoreMBB);
14684
14685  // mainMBB:
14686  //  v_main = 0
14687  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
14688  mainMBB->addSuccessor(sinkMBB);
14689
14690  // sinkMBB:
14691  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
14692          TII->get(X86::PHI), DstReg)
14693    .addReg(mainDstReg).addMBB(mainMBB)
14694    .addReg(restoreDstReg).addMBB(restoreMBB);
14695
14696  // restoreMBB:
14697  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
14698  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
14699  restoreMBB->addSuccessor(sinkMBB);
14700
14701  MI->eraseFromParent();
14702  return sinkMBB;
14703}
14704
14705MachineBasicBlock *
14706X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
14707                                     MachineBasicBlock *MBB) const {
14708  DebugLoc DL = MI->getDebugLoc();
14709  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14710
14711  MachineFunction *MF = MBB->getParent();
14712  MachineRegisterInfo &MRI = MF->getRegInfo();
14713
14714  // Memory Reference
14715  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
14716  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
14717
14718  MVT PVT = getPointerTy();
14719  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
14720         "Invalid Pointer Size!");
14721
14722  const TargetRegisterClass *RC =
14723    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
14724  unsigned Tmp = MRI.createVirtualRegister(RC);
14725  // Since FP is only updated here but NOT referenced, it's treated as GPR.
14726  const X86RegisterInfo *RegInfo =
14727    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
14728  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
14729  unsigned SP = RegInfo->getStackRegister();
14730
14731  MachineInstrBuilder MIB;
14732
14733  const int64_t LabelOffset = 1 * PVT.getStoreSize();
14734  const int64_t SPOffset = 2 * PVT.getStoreSize();
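  // The three reloads below read FP from offset 0 of the buffer, the resume
  // address from LabelOffset, and SP from SPOffset.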
14735
14736  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
14737  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
14738
14739  // Reload FP
14740  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
14741  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
14742    MIB.addOperand(MI->getOperand(i));
14743  MIB.setMemRefs(MMOBegin, MMOEnd);
14744  // Reload IP
14745  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
14746  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14747    if (i == X86::AddrDisp)
14748      MIB.addDisp(MI->getOperand(i), LabelOffset);
14749    else
14750      MIB.addOperand(MI->getOperand(i));
14751  }
14752  MIB.setMemRefs(MMOBegin, MMOEnd);
14753  // Reload SP
14754  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
14755  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14756    if (i == X86::AddrDisp)
14757      MIB.addDisp(MI->getOperand(i), SPOffset);
14758    else
14759      MIB.addOperand(MI->getOperand(i));
14760  }
14761  MIB.setMemRefs(MMOBegin, MMOEnd);
14762  // Jump
14763  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
14764
14765  MI->eraseFromParent();
14766  return MBB;
14767}
14768
14769MachineBasicBlock *
14770X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
14771                                               MachineBasicBlock *BB) const {
14772  switch (MI->getOpcode()) {
14773  default: llvm_unreachable("Unexpected instr type to insert");
14774  case X86::TAILJMPd64:
14775  case X86::TAILJMPr64:
14776  case X86::TAILJMPm64:
14777    llvm_unreachable("TAILJMP64 should not be touched here.");
14778  case X86::TCRETURNdi64:
14779  case X86::TCRETURNri64:
14780  case X86::TCRETURNmi64:
14781    return BB;
14782  case X86::WIN_ALLOCA:
14783    return EmitLoweredWinAlloca(MI, BB);
14784  case X86::SEG_ALLOCA_32:
14785    return EmitLoweredSegAlloca(MI, BB, false);
14786  case X86::SEG_ALLOCA_64:
14787    return EmitLoweredSegAlloca(MI, BB, true);
14788  case X86::TLSCall_32:
14789  case X86::TLSCall_64:
14790    return EmitLoweredTLSCall(MI, BB);
14791  case X86::CMOV_GR8:
14792  case X86::CMOV_FR32:
14793  case X86::CMOV_FR64:
14794  case X86::CMOV_V4F32:
14795  case X86::CMOV_V2F64:
14796  case X86::CMOV_V2I64:
14797  case X86::CMOV_V8F32:
14798  case X86::CMOV_V4F64:
14799  case X86::CMOV_V4I64:
14800  case X86::CMOV_GR16:
14801  case X86::CMOV_GR32:
14802  case X86::CMOV_RFP32:
14803  case X86::CMOV_RFP64:
14804  case X86::CMOV_RFP80:
14805    return EmitLoweredSelect(MI, BB);
14806
14807  case X86::FP32_TO_INT16_IN_MEM:
14808  case X86::FP32_TO_INT32_IN_MEM:
14809  case X86::FP32_TO_INT64_IN_MEM:
14810  case X86::FP64_TO_INT16_IN_MEM:
14811  case X86::FP64_TO_INT32_IN_MEM:
14812  case X86::FP64_TO_INT64_IN_MEM:
14813  case X86::FP80_TO_INT16_IN_MEM:
14814  case X86::FP80_TO_INT32_IN_MEM:
14815  case X86::FP80_TO_INT64_IN_MEM: {
14816    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14817    DebugLoc DL = MI->getDebugLoc();
14818
14819    // Change the floating point control register to use "round towards zero"
14820    // mode when truncating to an integer value.
14821    MachineFunction *F = BB->getParent();
14822    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
14823    addFrameReference(BuildMI(*BB, MI, DL,
14824                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
14825
14826    // Load the old value of the high byte of the control word...
14827    unsigned OldCW =
14828      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
14829    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
14830                      CWFrameIdx);
14831
14832    // Set the high part to be round to zero...
14833    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
14834      .addImm(0xC7F);
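    // 0xC7F keeps all exception mask bits set and sets the rounding-control
    // field (bits 10-11) to 11b, i.e. round toward zero / truncate.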
14835
14836    // Reload the modified control word now...
14837    addFrameReference(BuildMI(*BB, MI, DL,
14838                              TII->get(X86::FLDCW16m)), CWFrameIdx);
14839
14840    // Restore the memory image of control word to original value
14841    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
14842      .addReg(OldCW);
14843
14844    // Get the X86 opcode to use.
14845    unsigned Opc;
14846    switch (MI->getOpcode()) {
14847    default: llvm_unreachable("illegal opcode!");
14848    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
14849    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
14850    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
14851    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
14852    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
14853    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
14854    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
14855    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
14856    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
14857    }
14858
14859    X86AddressMode AM;
14860    MachineOperand &Op = MI->getOperand(0);
14861    if (Op.isReg()) {
14862      AM.BaseType = X86AddressMode::RegBase;
14863      AM.Base.Reg = Op.getReg();
14864    } else {
14865      AM.BaseType = X86AddressMode::FrameIndexBase;
14866      AM.Base.FrameIndex = Op.getIndex();
14867    }
14868    Op = MI->getOperand(1);
14869    if (Op.isImm())
14870      AM.Scale = Op.getImm();
14871    Op = MI->getOperand(2);
14872    if (Op.isImm())
14873      AM.IndexReg = Op.getImm();
14874    Op = MI->getOperand(3);
14875    if (Op.isGlobal()) {
14876      AM.GV = Op.getGlobal();
14877    } else {
14878      AM.Disp = Op.getImm();
14879    }
14880    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
14881                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
14882
14883    // Reload the original control word now.
14884    addFrameReference(BuildMI(*BB, MI, DL,
14885                              TII->get(X86::FLDCW16m)), CWFrameIdx);
14886
14887    MI->eraseFromParent();   // The pseudo instruction is gone now.
14888    return BB;
14889  }
14890  // String/text processing lowering.
14891  case X86::PCMPISTRM128REG:
14892  case X86::VPCMPISTRM128REG:
14893  case X86::PCMPISTRM128MEM:
14894  case X86::VPCMPISTRM128MEM:
14895  case X86::PCMPESTRM128REG:
14896  case X86::VPCMPESTRM128REG:
14897  case X86::PCMPESTRM128MEM:
14898  case X86::VPCMPESTRM128MEM:
14899    assert(Subtarget->hasSSE42() &&
14900           "Target must have SSE4.2 or AVX features enabled");
14901    return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
14902
14903  // String/text processing lowering.
14904  case X86::PCMPISTRIREG:
14905  case X86::VPCMPISTRIREG:
14906  case X86::PCMPISTRIMEM:
14907  case X86::VPCMPISTRIMEM:
14908  case X86::PCMPESTRIREG:
14909  case X86::VPCMPESTRIREG:
14910  case X86::PCMPESTRIMEM:
14911  case X86::VPCMPESTRIMEM:
14912    assert(Subtarget->hasSSE42() &&
14913           "Target must have SSE4.2 or AVX features enabled");
14914    return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
14915
14916  // Thread synchronization.
14917  case X86::MONITOR:
14918    return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
14919
14920  // xbegin
14921  case X86::XBEGIN:
14922    return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
14923
14924  // Atomic Lowering.
14925  case X86::ATOMAND8:
14926  case X86::ATOMAND16:
14927  case X86::ATOMAND32:
14928  case X86::ATOMAND64:
14929    // Fall through
14930  case X86::ATOMOR8:
14931  case X86::ATOMOR16:
14932  case X86::ATOMOR32:
14933  case X86::ATOMOR64:
14934    // Fall through
14935  case X86::ATOMXOR8:
14936  case X86::ATOMXOR16:
14937  case X86::ATOMXOR32:
14938  case X86::ATOMXOR64:
14939    // Fall through
14940  case X86::ATOMNAND8:
14941  case X86::ATOMNAND16:
14942  case X86::ATOMNAND32:
14943  case X86::ATOMNAND64:
14944    // Fall through
14945  case X86::ATOMMAX8:
14946  case X86::ATOMMAX16:
14947  case X86::ATOMMAX32:
14948  case X86::ATOMMAX64:
14949    // Fall through
14950  case X86::ATOMMIN8:
14951  case X86::ATOMMIN16:
14952  case X86::ATOMMIN32:
14953  case X86::ATOMMIN64:
14954    // Fall through
14955  case X86::ATOMUMAX8:
14956  case X86::ATOMUMAX16:
14957  case X86::ATOMUMAX32:
14958  case X86::ATOMUMAX64:
14959    // Fall through
14960  case X86::ATOMUMIN8:
14961  case X86::ATOMUMIN16:
14962  case X86::ATOMUMIN32:
14963  case X86::ATOMUMIN64:
14964    return EmitAtomicLoadArith(MI, BB);
14965
14966  // This group does 64-bit operations on a 32-bit host.
14967  case X86::ATOMAND6432:
14968  case X86::ATOMOR6432:
14969  case X86::ATOMXOR6432:
14970  case X86::ATOMNAND6432:
14971  case X86::ATOMADD6432:
14972  case X86::ATOMSUB6432:
14973  case X86::ATOMMAX6432:
14974  case X86::ATOMMIN6432:
14975  case X86::ATOMUMAX6432:
14976  case X86::ATOMUMIN6432:
14977  case X86::ATOMSWAP6432:
14978    return EmitAtomicLoadArith6432(MI, BB);
14979
14980  case X86::VASTART_SAVE_XMM_REGS:
14981    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
14982
14983  case X86::VAARG_64:
14984    return EmitVAARG64WithCustomInserter(MI, BB);
14985
14986  case X86::EH_SjLj_SetJmp32:
14987  case X86::EH_SjLj_SetJmp64:
14988    return emitEHSjLjSetJmp(MI, BB);
14989
14990  case X86::EH_SjLj_LongJmp32:
14991  case X86::EH_SjLj_LongJmp64:
14992    return emitEHSjLjLongJmp(MI, BB);
14993  }
14994}
14995
14996//===----------------------------------------------------------------------===//
14997//                           X86 Optimization Hooks
14998//===----------------------------------------------------------------------===//
14999
15000void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
15001                                                       APInt &KnownZero,
15002                                                       APInt &KnownOne,
15003                                                       const SelectionDAG &DAG,
15004                                                       unsigned Depth) const {
15005  unsigned BitWidth = KnownZero.getBitWidth();
15006  unsigned Opc = Op.getOpcode();
15007  assert((Opc >= ISD::BUILTIN_OP_END ||
15008          Opc == ISD::INTRINSIC_WO_CHAIN ||
15009          Opc == ISD::INTRINSIC_W_CHAIN ||
15010          Opc == ISD::INTRINSIC_VOID) &&
15011         "Should use MaskedValueIsZero if you don't know whether Op"
15012         " is a target node!");
15013
15014  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
15015  switch (Opc) {
15016  default: break;
15017  case X86ISD::ADD:
15018  case X86ISD::SUB:
15019  case X86ISD::ADC:
15020  case X86ISD::SBB:
15021  case X86ISD::SMUL:
15022  case X86ISD::UMUL:
15023  case X86ISD::INC:
15024  case X86ISD::DEC:
15025  case X86ISD::OR:
15026  case X86ISD::XOR:
15027  case X86ISD::AND:
15028    // These nodes' second result is a boolean.
15029    if (Op.getResNo() == 0)
15030      break;
15031    // Fallthrough
15032  case X86ISD::SETCC:
15033    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
15034    break;
15035  case ISD::INTRINSIC_WO_CHAIN: {
15036    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
15037    unsigned NumLoBits = 0;
15038    switch (IntId) {
15039    default: break;
15040    case Intrinsic::x86_sse_movmsk_ps:
15041    case Intrinsic::x86_avx_movmsk_ps_256:
15042    case Intrinsic::x86_sse2_movmsk_pd:
15043    case Intrinsic::x86_avx_movmsk_pd_256:
15044    case Intrinsic::x86_mmx_pmovmskb:
15045    case Intrinsic::x86_sse2_pmovmskb_128:
15046    case Intrinsic::x86_avx2_pmovmskb: {
15047      // High bits of movmskp{s|d}, pmovmskb are known zero.
15048      switch (IntId) {
15049        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
15050        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
15051        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
15052        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
15053        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
15054        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
15055        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
15056        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
15057      }
15058      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
15059      break;
15060    }
15061    }
15062    break;
15063  }
15064  }
15065}
15066
15067unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
15068                                                         unsigned Depth) const {
15069  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
15070  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
15071    return Op.getValueType().getScalarType().getSizeInBits();
15072
15073  // Fallback case.
15074  return 1;
15075}
15076
15077/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
15078/// node is a GlobalAddress + offset.
15079bool X86TargetLowering::isGAPlusOffset(SDNode *N,
15080                                       const GlobalValue* &GA,
15081                                       int64_t &Offset) const {
15082  if (N->getOpcode() == X86ISD::Wrapper) {
15083    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
15084      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
15085      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
15086      return true;
15087    }
15088  }
15089  return TargetLowering::isGAPlusOffset(N, GA, Offset);
15090}
15091
15092/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
15093/// same as extracting the high 128-bit part of a 256-bit vector and then
15094/// inserting the result into the low part of a new 256-bit vector.
15095static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
15096  EVT VT = SVOp->getValueType(0);
15097  unsigned NumElems = VT.getVectorNumElements();
15098
15099  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15100  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
15101    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
15102        SVOp->getMaskElt(j) >= 0)
15103      return false;
15104
15105  return true;
15106}
15107
15108/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
15109/// same as extracting the low 128-bit part of a 256-bit vector and then
15110/// inserting the result into the high part of a new 256-bit vector.
15111static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
15112  EVT VT = SVOp->getValueType(0);
15113  unsigned NumElems = VT.getVectorNumElements();
15114
15115  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15116  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
15117    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
15118        SVOp->getMaskElt(j) >= 0)
15119      return false;
15120
15121  return true;
15122}
15123
15124/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
15125static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
15126                                        TargetLowering::DAGCombinerInfo &DCI,
15127                                        const X86Subtarget* Subtarget) {
15128  SDLoc dl(N);
15129  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
15130  SDValue V1 = SVOp->getOperand(0);
15131  SDValue V2 = SVOp->getOperand(1);
15132  EVT VT = SVOp->getValueType(0);
15133  unsigned NumElems = VT.getVectorNumElements();
15134
15135  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
15136      V2.getOpcode() == ISD::CONCAT_VECTORS) {
15137    //
15138    //                   0,0,0,...
15139    //                      |
15140    //    V      UNDEF    BUILD_VECTOR    UNDEF
15141    //     \      /           \           /
15142    //  CONCAT_VECTOR         CONCAT_VECTOR
15143    //         \                  /
15144    //          \                /
15145    //          RESULT: V + zero extended
15146    //
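    // For example (v8i32, illustrative only): this combine fires on
    //   vector_shuffle <0,1,2,3,8,8,8,8>,
    //                  (concat_vectors V, undef),
    //                  (concat_vectors (build_vector 0,0,0,0), undef)
    // which is simply V zero-extended to 256 bits.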
15147    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
15148        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
15149        V1.getOperand(1).getOpcode() != ISD::UNDEF)
15150      return SDValue();
15151
15152    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
15153      return SDValue();
15154
15155    // To match the shuffle mask, the first half of the mask should
15156    // be exactly the first vector, and all the rest a splat with the
15157    // first element of the second one.
15158    for (unsigned i = 0; i != NumElems/2; ++i)
15159      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
15160          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
15161        return SDValue();
15162
15163    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
15164    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
15165      if (Ld->hasNUsesOfValue(1, 0)) {
15166        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
15167        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
15168        SDValue ResNode =
15169          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
15170                                  array_lengthof(Ops),
15171                                  Ld->getMemoryVT(),
15172                                  Ld->getPointerInfo(),
15173                                  Ld->getAlignment(),
15174                                  false/*isVolatile*/, true/*ReadMem*/,
15175                                  false/*WriteMem*/);
15176
15177        // Make sure the newly-created LOAD is in the same position as Ld in
15178        // terms of dependency. We create a TokenFactor for Ld and ResNode,
15179        // and update uses of Ld's output chain to use the TokenFactor.
15180        if (Ld->hasAnyUseOfValue(1)) {
15181          SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
15182                             SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
15183          DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
15184          DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
15185                                 SDValue(ResNode.getNode(), 1));
15186        }
15187
15188        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
15189      }
15190    }
15191
15192    // Emit a zeroed vector and insert the desired subvector on its
15193    // first half.
15194    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
15195    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
15196    return DCI.CombineTo(N, InsV);
15197  }
15198
15199  //===--------------------------------------------------------------------===//
15200  // Combine some shuffles into subvector extracts and inserts:
15201  //
15202
15203  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15204  if (isShuffleHigh128VectorInsertLow(SVOp)) {
15205    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
15206    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
15207    return DCI.CombineTo(N, InsV);
15208  }
15209
15210  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15211  if (isShuffleLow128VectorInsertHigh(SVOp)) {
15212    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
15213    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
15214    return DCI.CombineTo(N, InsV);
15215  }
15216
15217  return SDValue();
15218}
15219
15220/// PerformShuffleCombine - Performs several different shuffle combines.
15221static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
15222                                     TargetLowering::DAGCombinerInfo &DCI,
15223                                     const X86Subtarget *Subtarget) {
15224  SDLoc dl(N);
15225  EVT VT = N->getValueType(0);
15226
15227  // Don't create instructions with illegal types after legalize types has run.
15228  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15229  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
15230    return SDValue();
15231
15232  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
15233  if (Subtarget->hasFp256() && VT.is256BitVector() &&
15234      N->getOpcode() == ISD::VECTOR_SHUFFLE)
15235    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
15236
15237  // Only handle 128-bit wide vectors from here on.
15238  if (!VT.is128BitVector())
15239    return SDValue();
15240
15241  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
15242  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
15243  // consecutive, non-overlapping, and in the right order.
15244  SmallVector<SDValue, 16> Elts;
15245  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
15246    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
15247
15248  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
15249}
15250
15251/// PerformTruncateCombine - Converts a truncate operation to
15252/// a sequence of vector shuffle operations.
15253/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
15254static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
15255                                      TargetLowering::DAGCombinerInfo &DCI,
15256                                      const X86Subtarget *Subtarget)  {
15257  return SDValue();
15258}
15259
15260/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
15261/// specific shuffle of a load can be folded into a single element load.
15262/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
15263/// shuffles have been custom lowered so we need to handle those here.
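/// For example (illustrative), extracting lane 0 from a target shuffle whose
/// selected input is a one-use load can be rewritten back to a generic
/// VECTOR_SHUFFLE so the DAG combiner narrows it to a single scalar load.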
15264static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
15265                                         TargetLowering::DAGCombinerInfo &DCI) {
15266  if (DCI.isBeforeLegalizeOps())
15267    return SDValue();
15268
15269  SDValue InVec = N->getOperand(0);
15270  SDValue EltNo = N->getOperand(1);
15271
15272  if (!isa<ConstantSDNode>(EltNo))
15273    return SDValue();
15274
15275  EVT VT = InVec.getValueType();
15276
15277  bool HasShuffleIntoBitcast = false;
15278  if (InVec.getOpcode() == ISD::BITCAST) {
15279    // Don't duplicate a load with other uses.
15280    if (!InVec.hasOneUse())
15281      return SDValue();
15282    EVT BCVT = InVec.getOperand(0).getValueType();
15283    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
15284      return SDValue();
15285    InVec = InVec.getOperand(0);
15286    HasShuffleIntoBitcast = true;
15287  }
15288
15289  if (!isTargetShuffle(InVec.getOpcode()))
15290    return SDValue();
15291
15292  // Don't duplicate a load with other uses.
15293  if (!InVec.hasOneUse())
15294    return SDValue();
15295
15296  SmallVector<int, 16> ShuffleMask;
15297  bool UnaryShuffle;
15298  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
15299                            UnaryShuffle))
15300    return SDValue();
15301
15302  // Select the input vector, guarding against an out-of-range extract index.
15303  unsigned NumElems = VT.getVectorNumElements();
15304  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
15305  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
15306  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
15307                                         : InVec.getOperand(1);
15308
15309  // If inputs to shuffle are the same for both ops, then allow 2 uses
15310  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
15311
15312  if (LdNode.getOpcode() == ISD::BITCAST) {
15313    // Don't duplicate a load with other uses.
15314    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
15315      return SDValue();
15316
15317    AllowedUses = 1; // only allow 1 load use if we have a bitcast
15318    LdNode = LdNode.getOperand(0);
15319  }
15320
15321  if (!ISD::isNormalLoad(LdNode.getNode()))
15322    return SDValue();
15323
15324  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
15325
15326  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
15327    return SDValue();
15328
15329  if (HasShuffleIntoBitcast) {
15330    // If there's a bitcast before the shuffle, check that the load's type and
15331    // alignment are valid.
15332    unsigned Align = LN0->getAlignment();
15333    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15334    unsigned NewAlign = TLI.getDataLayout()->
15335      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
15336
15337    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
15338      return SDValue();
15339  }
15340
15341  // All checks match so transform back to vector_shuffle so that DAG combiner
15342  // can finish the job
15343  SDLoc dl(N);
15344
15345  // Create a shuffle node, accounting for the case that it's a unary shuffle.
15346  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
15347  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
15348                                 InVec.getOperand(0), Shuffle,
15349                                 &ShuffleMask[0]);
15350  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
15351  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
15352                     EltNo);
15353}
15354
15355/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
15356/// generation and convert it from being a bunch of shuffles and extracts
15357/// to a simple store and scalar loads to extract the elements.
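/// For a v4i32 input where all four lanes are extracted and then sign- or
/// zero-extended, the vector is spilled to a stack slot and each lane is
/// re-read with a scalar load (see the transformation below).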
15358static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
15359                                         TargetLowering::DAGCombinerInfo &DCI) {
15360  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
15361  if (NewOp.getNode())
15362    return NewOp;
15363
15364  SDValue InputVector = N->getOperand(0);
15365  // Detect whether we are trying to convert from mmx to i32 and the bitcast
15366  // from mmx to v2i32 has a single usage.
15367  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
15368      InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
15369      InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
15370    return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
15371                       N->getValueType(0),
15372                       InputVector.getNode()->getOperand(0));
15373
15374  // Only operate on vectors of 4 elements, where the alternative shuffling
15375  // gets to be more expensive.
15376  if (InputVector.getValueType() != MVT::v4i32)
15377    return SDValue();
15378
15379  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
15380  // single use which is a sign-extend or zero-extend, and all elements are
15381  // used.
15382  SmallVector<SDNode *, 4> Uses;
15383  unsigned ExtractedElements = 0;
15384  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
15385       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
15386    if (UI.getUse().getResNo() != InputVector.getResNo())
15387      return SDValue();
15388
15389    SDNode *Extract = *UI;
15390    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15391      return SDValue();
15392
15393    if (Extract->getValueType(0) != MVT::i32)
15394      return SDValue();
15395    if (!Extract->hasOneUse())
15396      return SDValue();
15397    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
15398        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
15399      return SDValue();
15400    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
15401      return SDValue();
15402
15403    // Record which element was extracted.
15404    ExtractedElements |=
15405      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
15406
15407    Uses.push_back(Extract);
15408  }
15409
15410  // If not all the elements were used, this may not be worthwhile.
15411  if (ExtractedElements != 15)
15412    return SDValue();
15413
15414  // Ok, we've now decided to do the transformation.
15415  SDLoc dl(InputVector);
15416
15417  // Store the value to a temporary stack slot.
15418  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
15419  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
15420                            MachinePointerInfo(), false, false, 0);
15421
15422  // Replace each use (extract) with a load of the appropriate element.
15423  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
15424       UE = Uses.end(); UI != UE; ++UI) {
15425    SDNode *Extract = *UI;
15426
15427    // Compute the element's address.
15428    SDValue Idx = Extract->getOperand(1);
15429    unsigned EltSize =
15430        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
15431    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
15432    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15433    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
15434
15435    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
15436                                     StackPtr, OffsetVal);
15437
15438    // Load the scalar.
15439    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
15440                                     ScalarAddr, MachinePointerInfo(),
15441                                     false, false, false, 0);
15442
15443    // Replace the extract with the load.
15444    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
15445  }
15446
15447  // The replacement was made in place; don't return anything.
15448  return SDValue();
15449}
15450
15451/// \brief Matches a VSELECT onto min/max or returns 0 if the node doesn't match.
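/// For example (illustrative): (vselect (setcc X, Y, setult), X, Y) maps to
/// X86ISD::UMIN, and the reversed-arm form maps to X86ISD::UMAX, provided the
/// subtarget has the required SSE2/SSE4.1/AVX2 support for the element type.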
15452static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
15453                                   SDValue RHS, SelectionDAG &DAG,
15454                                   const X86Subtarget *Subtarget) {
15455  if (!VT.isVector())
15456    return 0;
15457
15458  switch (VT.getSimpleVT().SimpleTy) {
15459  default: return 0;
15460  case MVT::v32i8:
15461  case MVT::v16i16:
15462  case MVT::v8i32:
15463    if (!Subtarget->hasAVX2())
15464      return 0;
15465  case MVT::v16i8:
15466  case MVT::v8i16:
15467  case MVT::v4i32:
15468    if (!Subtarget->hasSSE2())
15469      return 0;
15470  }
15471
15472  // SSE2 has only a small subset of the operations.
15473  bool hasUnsigned = Subtarget->hasSSE41() ||
15474                     (Subtarget->hasSSE2() && VT == MVT::v16i8);
15475  bool hasSigned = Subtarget->hasSSE41() ||
15476                   (Subtarget->hasSSE2() && VT == MVT::v8i16);
15477
15478  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
15479
15480  // Check for x CC y ? x : y.
15481  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
15482      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
15483    switch (CC) {
15484    default: break;
15485    case ISD::SETULT:
15486    case ISD::SETULE:
15487      return hasUnsigned ? X86ISD::UMIN : 0;
15488    case ISD::SETUGT:
15489    case ISD::SETUGE:
15490      return hasUnsigned ? X86ISD::UMAX : 0;
15491    case ISD::SETLT:
15492    case ISD::SETLE:
15493      return hasSigned ? X86ISD::SMIN : 0;
15494    case ISD::SETGT:
15495    case ISD::SETGE:
15496      return hasSigned ? X86ISD::SMAX : 0;
15497    }
15498  // Check for x CC y ? y : x -- a min/max with reversed arms.
15499  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
15500             DAG.isEqualTo(RHS, Cond.getOperand(0))) {
15501    switch (CC) {
15502    default: break;
15503    case ISD::SETULT:
15504    case ISD::SETULE:
15505      return hasUnsigned ? X86ISD::UMAX : 0;
15506    case ISD::SETUGT:
15507    case ISD::SETUGE:
15508      return hasUnsigned ? X86ISD::UMIN : 0;
15509    case ISD::SETLT:
15510    case ISD::SETLE:
15511      return hasSigned ? X86ISD::SMAX : 0;
15512    case ISD::SETGT:
15513    case ISD::SETGE:
15514      return hasSigned ? X86ISD::SMIN : 0;
15515    }
15516  }
15517
15518  return 0;
15519}
15520
15521/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
15522/// nodes.
15523static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
15524                                    TargetLowering::DAGCombinerInfo &DCI,
15525                                    const X86Subtarget *Subtarget) {
15526  SDLoc DL(N);
15527  SDValue Cond = N->getOperand(0);
15528  // Get the LHS/RHS of the select.
15529  SDValue LHS = N->getOperand(1);
15530  SDValue RHS = N->getOperand(2);
15531  EVT VT = LHS.getValueType();
15532
15533  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
15534  // instructions match the semantics of the common C idiom x<y?x:y but not
15535  // x<=y?x:y, because of how they handle negative zero (which can be
15536  // ignored in unsafe-math mode).
15537  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
15538      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
15539      (Subtarget->hasSSE2() ||
15540       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
15541    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
15542
15543    unsigned Opcode = 0;
15544    // Check for x CC y ? x : y.
15545    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
15546        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
15547      switch (CC) {
15548      default: break;
15549      case ISD::SETULT:
15550        // Converting this to a min would handle NaNs incorrectly, and swapping
15551        // the operands would cause it to handle comparisons between positive
15552        // and negative zero incorrectly.
15553        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
15554          if (!DAG.getTarget().Options.UnsafeFPMath &&
15555              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
15556            break;
15557          std::swap(LHS, RHS);
15558        }
15559        Opcode = X86ISD::FMIN;
15560        break;
15561      case ISD::SETOLE:
15562        // Converting this to a min would handle comparisons between positive
15563        // and negative zero incorrectly.
15564        if (!DAG.getTarget().Options.UnsafeFPMath &&
15565            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
15566          break;
15567        Opcode = X86ISD::FMIN;
15568        break;
15569      case ISD::SETULE:
15570        // Converting this to a min would handle both negative zeros and NaNs
15571        // incorrectly, but we can swap the operands to fix both.
15572        std::swap(LHS, RHS);
15573      case ISD::SETOLT:
15574      case ISD::SETLT:
15575      case ISD::SETLE:
15576        Opcode = X86ISD::FMIN;
15577        break;
15578
15579      case ISD::SETOGE:
15580        // Converting this to a max would handle comparisons between positive
15581        // and negative zero incorrectly.
15582        if (!DAG.getTarget().Options.UnsafeFPMath &&
15583            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
15584          break;
15585        Opcode = X86ISD::FMAX;
15586        break;
15587      case ISD::SETUGT:
15588        // Converting this to a max would handle NaNs incorrectly, and swapping
15589        // the operands would cause it to handle comparisons between positive
15590        // and negative zero incorrectly.
15591        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
15592          if (!DAG.getTarget().Options.UnsafeFPMath &&
15593              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
15594            break;
15595          std::swap(LHS, RHS);
15596        }
15597        Opcode = X86ISD::FMAX;
15598        break;
15599      case ISD::SETUGE:
15600        // Converting this to a max would handle both negative zeros and NaNs
15601        // incorrectly, but we can swap the operands to fix both.
15602        std::swap(LHS, RHS);
15603      case ISD::SETOGT:
15604      case ISD::SETGT:
15605      case ISD::SETGE:
15606        Opcode = X86ISD::FMAX;
15607        break;
15608      }
15609    // Check for x CC y ? y : x -- a min/max with reversed arms.
15610    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
15611               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
15612      switch (CC) {
15613      default: break;
15614      case ISD::SETOGE:
15615        // Converting this to a min would handle comparisons between positive
15616        // and negative zero incorrectly, and swapping the operands would
15617        // cause it to handle NaNs incorrectly.
15618        if (!DAG.getTarget().Options.UnsafeFPMath &&
15619            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
15620          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
15621            break;
15622          std::swap(LHS, RHS);
15623        }
15624        Opcode = X86ISD::FMIN;
15625        break;
15626      case ISD::SETUGT:
15627        // Converting this to a min would handle NaNs incorrectly.
15628        if (!DAG.getTarget().Options.UnsafeFPMath &&
15629            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
15630          break;
15631        Opcode = X86ISD::FMIN;
15632        break;
15633      case ISD::SETUGE:
15634        // Converting this to a min would handle both negative zeros and NaNs
15635        // incorrectly, but we can swap the operands to fix both.
15636        std::swap(LHS, RHS);
15637      case ISD::SETOGT:
15638      case ISD::SETGT:
15639      case ISD::SETGE:
15640        Opcode = X86ISD::FMIN;
15641        break;
15642
15643      case ISD::SETULT:
15644        // Converting this to a max would handle NaNs incorrectly.
15645        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
15646          break;
15647        Opcode = X86ISD::FMAX;
15648        break;
15649      case ISD::SETOLE:
15650        // Converting this to a max would handle comparisons between positive
15651        // and negative zero incorrectly, and swapping the operands would
15652        // cause it to handle NaNs incorrectly.
15653        if (!DAG.getTarget().Options.UnsafeFPMath &&
15654            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
15655          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
15656            break;
15657          std::swap(LHS, RHS);
15658        }
15659        Opcode = X86ISD::FMAX;
15660        break;
15661      case ISD::SETULE:
15662        // Converting this to a max would handle both negative zeros and NaNs
15663        // incorrectly, but we can swap the operands to fix both.
15664        std::swap(LHS, RHS);
15665      case ISD::SETOLT:
15666      case ISD::SETLT:
15667      case ISD::SETLE:
15668        Opcode = X86ISD::FMAX;
15669        break;
15670      }
15671    }
15672
15673    if (Opcode)
15674      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
15675  }
15676
15677  // If this is a select between two integer constants, try to do some
15678  // optimizations.
15679  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
15680    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
15681      // Don't do this for crazy integer types.
15682      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
15683        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
15684        // so that TrueC (the true value) is larger than FalseC.
15685        bool NeedsCondInvert = false;
15686
15687        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
15688            // Efficiently invertible.
15689            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
15690             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
15691              isa<ConstantSDNode>(Cond.getOperand(1))))) {
15692          NeedsCondInvert = true;
15693          std::swap(TrueC, FalseC);
15694        }
15695
15696        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
15697        if (FalseC->getAPIntValue() == 0 &&
15698            TrueC->getAPIntValue().isPowerOf2()) {
15699          if (NeedsCondInvert) // Invert the condition if needed.
15700            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
15701                               DAG.getConstant(1, Cond.getValueType()));
15702
15703          // Zero extend the condition if needed.
15704          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
15705
15706          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
15707          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
15708                             DAG.getConstant(ShAmt, MVT::i8));
15709        }
15710
15711        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
15712        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
15713          if (NeedsCondInvert) // Invert the condition if needed.
15714            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
15715                               DAG.getConstant(1, Cond.getValueType()));
15716
15717          // Zero extend the condition if needed.
15718          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
15719                             FalseC->getValueType(0), Cond);
15720          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
15721                             SDValue(FalseC, 0));
15722        }
15723
15724        // Optimize cases that will turn into an LEA instruction.  This requires
15725        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
15726        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
15727          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
15728          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
15729
15730          bool isFastMultiplier = false;
15731          if (Diff < 10) {
15732            switch ((unsigned char)Diff) {
15733              default: break;
15734              case 1:  // result = add base, cond
15735              case 2:  // result = lea base(    , cond*2)
15736              case 3:  // result = lea base(cond, cond*2)
15737              case 4:  // result = lea base(    , cond*4)
15738              case 5:  // result = lea base(cond, cond*4)
15739              case 8:  // result = lea base(    , cond*8)
15740              case 9:  // result = lea base(cond, cond*8)
15741                isFastMultiplier = true;
15742                break;
15743            }
15744          }
15745
15746          if (isFastMultiplier) {
15747            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
15748            if (NeedsCondInvert) // Invert the condition if needed.
15749              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
15750                                 DAG.getConstant(1, Cond.getValueType()));
15751
15752            // Zero extend the condition if needed.
15753            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
15754                               Cond);
15755            // Scale the condition by the difference.
15756            if (Diff != 1)
15757              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
15758                                 DAG.getConstant(Diff, Cond.getValueType()));
15759
15760            // Add the base if non-zero.
15761            if (FalseC->getAPIntValue() != 0)
15762              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
15763                                 SDValue(FalseC, 0));
15764            return Cond;
15765          }
15766        }
15767      }
15768  }
15769
15770  // Canonicalize max and min:
15771  // (x > y) ? x : y -> (x >= y) ? x : y
15772  // (x < y) ? x : y -> (x <= y) ? x : y
15773  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
15774  // the need for an extra compare
15775  // against zero. e.g.
15776  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
15777  // subl   %esi, %edi
15778  // testl  %edi, %edi
15779  // movl   $0, %eax
15780  // cmovgl %edi, %eax
15781  // =>
15782  // xorl   %eax, %eax
15783  // subl   %esi, %edi
15784  // cmovsl %eax, %edi
15785  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
15786      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
15787      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
15788    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
15789    switch (CC) {
15790    default: break;
15791    case ISD::SETLT:
15792    case ISD::SETGT: {
15793      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
15794      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
15795                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
15796      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
15797    }
15798    }
15799  }
15800
15801  // Match VSELECTs into subs with unsigned saturation.
15802  if (!DCI.isBeforeLegalize() &&
15803      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
15804      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
15805      ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
15806       (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
15807    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
15808
15809    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
15810    // left side invert the predicate to simplify logic below.
15811    SDValue Other;
15812    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
15813      Other = RHS;
15814      CC = ISD::getSetCCInverse(CC, true);
15815    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
15816      Other = LHS;
15817    }
15818
15819    if (Other.getNode() && Other->getNumOperands() == 2 &&
15820        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
15821      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
15822      SDValue CondRHS = Cond->getOperand(1);
15823
15824      // Look for a general sub with unsigned saturation first.
15825      // x >= y ? x-y : 0 --> subus x, y
15826      // x >  y ? x-y : 0 --> subus x, y
15827      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
15828          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
15829        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
15830
15831      // If the RHS is a constant we have to reverse the const canonicalization.
15832      // x > C-1 ? x+(-C) : 0 --> subus x, C
15833      if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
15834          isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
15835        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
15836        if (CondRHS.getConstantOperandVal(0) == -A-1)
15837          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
15838                             DAG.getConstant(-A, VT));
15839      }
15840
15841      // Another special case: If C was a sign bit, the sub has been
15842      // canonicalized into a xor.
15843      // FIXME: Would it be better to use ComputeMaskedBits to determine whether
15844      //        it's safe to decanonicalize the xor?
15845      // x s< 0 ? x^C : 0 --> subus x, C
15846      if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
15847          ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
15848          isSplatVector(OpRHS.getNode())) {
15849        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
15850        if (A.isSignBit())
15851          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
15852      }
15853    }
15854  }
15855
15856  // Try to match a min/max vector operation.
15857  if (!DCI.isBeforeLegalize() &&
15858      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
15859    if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
15860      return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
15861
15862  // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
15863  if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
15864      Cond.getOpcode() == ISD::SETCC) {
15865
15866    assert(Cond.getValueType().isVector() &&
15867           "vector select expects a vector selector!");
15868
15869    EVT IntVT = Cond.getValueType();
15870    bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
15871    bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
15872
15873    if (!TValIsAllOnes && !FValIsAllZeros) {
15874      // Try inverting the condition if the true value is not all 1s and the
15875      // false value is not all 0s.
15876      bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
15877      bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
15878
15879      if (TValIsAllZeros || FValIsAllOnes) {
15880        SDValue CC = Cond.getOperand(2);
15881        ISD::CondCode NewCC =
15882          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
15883                               Cond.getOperand(0).getValueType().isInteger());
15884        Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
15885        std::swap(LHS, RHS);
15886        TValIsAllOnes = FValIsAllOnes;
15887        FValIsAllZeros = TValIsAllZeros;
15888      }
15889    }
15890
15891    if (TValIsAllOnes || FValIsAllZeros) {
15892      SDValue Ret;
15893
15894      if (TValIsAllOnes && FValIsAllZeros)
15895        Ret = Cond;
15896      else if (TValIsAllOnes)
15897        Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
15898                          DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
15899      else if (FValIsAllZeros)
15900        Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
15901                          DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
15902
15903      return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
15904    }
15905  }
15906
15907  // If we know that this node is legal then we know that it is going to be
15908  // matched by one of the SSE/AVX BLEND instructions. These instructions only
15909  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
15910  // to simplify previous instructions.
15911  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15912  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
15913      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
15914    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
15915
15916    // Don't optimize vector selects that map to mask-registers.
15917    if (BitWidth == 1)
15918      return SDValue();
15919
15920    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
15921    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
15922
15923    APInt KnownZero, KnownOne;
15924    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
15925                                          DCI.isBeforeLegalizeOps());
15926    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
15927        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
15928      DCI.CommitTargetLoweringOpt(TLO);
15929  }
15930
15931  return SDValue();
15932}
15933
15934// Check whether a boolean test is testing a boolean value generated by
15935// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
15936// code.
15937//
15938// Simplify the following patterns:
15939// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
15940// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
15941// to (Op EFLAGS Cond)
15942//
15943// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
15944// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
15945// to (Op EFLAGS !Cond)
15946//
15947// where Op could be BRCOND or CMOV.
15948//
15949static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
15950  // Quit unless this is a CMP, or a SUB whose value result is unused.
15951  if (Cmp.getOpcode() != X86ISD::CMP &&
15952      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
15953      return SDValue();
15954
15955  // Quit if not used as a boolean value.
15956  if (CC != X86::COND_E && CC != X86::COND_NE)
15957    return SDValue();
15958
15959  // Check the CMP operands. One of them should be 0 or 1 and the other should
15960  // be a SetCC or a value extended from it.
15961  SDValue Op1 = Cmp.getOperand(0);
15962  SDValue Op2 = Cmp.getOperand(1);
15963
15964  SDValue SetCC;
15965  const ConstantSDNode* C = 0;
15966  bool needOppositeCond = (CC == X86::COND_E);
15967  bool checkAgainstTrue = false; // Is it a comparison against 1?
15968
15969  if ((C = dyn_cast<ConstantSDNode>(Op1)))
15970    SetCC = Op2;
15971  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
15972    SetCC = Op1;
15973  else // Quit if neither operand is a constant.
15974    return SDValue();
15975
15976  if (C->getZExtValue() == 1) {
15977    needOppositeCond = !needOppositeCond;
15978    checkAgainstTrue = true;
15979  } else if (C->getZExtValue() != 0)
15980    // Quit if the constant is neither 0 nor 1.
15981    return SDValue();
15982
15983  bool truncatedToBoolWithAnd = false;
15984  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
15985  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
15986         SetCC.getOpcode() == ISD::TRUNCATE ||
15987         SetCC.getOpcode() == ISD::AND) {
15988    if (SetCC.getOpcode() == ISD::AND) {
15989      int OpIdx = -1;
15990      ConstantSDNode *CS;
15991      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
15992          CS->getZExtValue() == 1)
15993        OpIdx = 1;
15994      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
15995          CS->getZExtValue() == 1)
15996        OpIdx = 0;
15997      if (OpIdx == -1)
15998        break;
15999      SetCC = SetCC.getOperand(OpIdx);
16000      truncatedToBoolWithAnd = true;
16001    } else
16002      SetCC = SetCC.getOperand(0);
16003  }
16004
16005  switch (SetCC.getOpcode()) {
16006  case X86ISD::SETCC_CARRY:
16007    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
16008    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
16009    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
16010    // truncated to i1 using 'and'.
16011    if (checkAgainstTrue && !truncatedToBoolWithAnd)
16012      break;
16013    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
16014           "Invalid use of SETCC_CARRY!");
16015    // FALL THROUGH
16016  case X86ISD::SETCC:
16017    // Set the condition code or opposite one if necessary.
16018    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
16019    if (needOppositeCond)
16020      CC = X86::GetOppositeBranchCondition(CC);
16021    return SetCC.getOperand(1);
16022  case X86ISD::CMOV: {
16023    // Check whether the false/true values are canonical ones, i.e. 0 or 1.
16024    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
16025    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
16026    // Quit if true value is not a constant.
16027    if (!TVal)
16028      return SDValue();
16029    // Quit if false value is not a constant.
16030    if (!FVal) {
16031      SDValue Op = SetCC.getOperand(0);
16032      // Skip 'zext' or 'trunc' node.
16033      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
16034          Op.getOpcode() == ISD::TRUNCATE)
16035        Op = Op.getOperand(0);
16036      // A special case for rdrand/rdseed, where 0 is set if the false
16037      // condition is found.
16038      if ((Op.getOpcode() != X86ISD::RDRAND &&
16039           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
16040        return SDValue();
16041    }
16042    // Quit if false value is not the constant 0 or 1.
16043    bool FValIsFalse = true;
16044    if (FVal && FVal->getZExtValue() != 0) {
16045      if (FVal->getZExtValue() != 1)
16046        return SDValue();
16047      // If FVal is 1, opposite cond is needed.
16048      needOppositeCond = !needOppositeCond;
16049      FValIsFalse = false;
16050    }
16051    // Quit if TVal is not the constant opposite of FVal.
16052    if (FValIsFalse && TVal->getZExtValue() != 1)
16053      return SDValue();
16054    if (!FValIsFalse && TVal->getZExtValue() != 0)
16055      return SDValue();
16056    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
16057    if (needOppositeCond)
16058      CC = X86::GetOppositeBranchCondition(CC);
16059    return SetCC.getOperand(3);
16060  }
16061  }
16062
16063  return SDValue();
16064}
16065
16066/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
16067static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
16068                                  TargetLowering::DAGCombinerInfo &DCI,
16069                                  const X86Subtarget *Subtarget) {
16070  SDLoc DL(N);
16071
16072  // If the flag operand isn't dead, don't touch this CMOV.
16073  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
16074    return SDValue();
16075
16076  SDValue FalseOp = N->getOperand(0);
16077  SDValue TrueOp = N->getOperand(1);
16078  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
16079  SDValue Cond = N->getOperand(3);
16080
16081  if (CC == X86::COND_E || CC == X86::COND_NE) {
16082    switch (Cond.getOpcode()) {
16083    default: break;
16084    case X86ISD::BSR:
16085    case X86ISD::BSF:
16086      // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
16087      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
16088        return (CC == X86::COND_E) ? FalseOp : TrueOp;
16089    }
16090  }
16091
16092  SDValue Flags;
16093
16094  Flags = checkBoolTestSetCCCombine(Cond, CC);
16095  if (Flags.getNode() &&
16096      // Extra check as FCMOV only supports a subset of X86 cond.
16097      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
16098    SDValue Ops[] = { FalseOp, TrueOp,
16099                      DAG.getConstant(CC, MVT::i8), Flags };
16100    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
16101                       Ops, array_lengthof(Ops));
16102  }
16103
16104  // If this is a select between two integer constants, try to do some
16105  // optimizations.  Note that the operands are ordered the opposite of SELECT
16106  // operands.
16107  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
16108    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
16109      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
16110      // larger than FalseC (the false value).
16111      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
16112        CC = X86::GetOppositeBranchCondition(CC);
16113        std::swap(TrueC, FalseC);
16114        std::swap(TrueOp, FalseOp);
16115      }
16116
16117      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
16118      // This is efficient for any integer data type (including i8/i16) and
16119      // shift amount.
16120      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
16121        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
16122                           DAG.getConstant(CC, MVT::i8), Cond);
16123
16124        // Zero extend the condition if needed.
16125        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
16126
16127        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
16128        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
16129                           DAG.getConstant(ShAmt, MVT::i8));
16130        if (N->getNumValues() == 2)  // Dead flag value?
16131          return DCI.CombineTo(N, Cond, SDValue());
16132        return Cond;
16133      }
16134
16135      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
16136      // for any integer data type, including i8/i16.
16137      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
16138        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
16139                           DAG.getConstant(CC, MVT::i8), Cond);
16140
16141        // Zero extend the condition if needed.
16142        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
16143                           FalseC->getValueType(0), Cond);
16144        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
16145                           SDValue(FalseC, 0));
16146
16147        if (N->getNumValues() == 2)  // Dead flag value?
16148          return DCI.CombineTo(N, Cond, SDValue());
16149        return Cond;
16150      }
16151
16152      // Optimize cases that will turn into an LEA instruction.  This requires
16153      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
16154      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
16155        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
16156        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
16157
16158        bool isFastMultiplier = false;
16159        if (Diff < 10) {
16160          switch ((unsigned char)Diff) {
16161          default: break;
16162          case 1:  // result = add base, cond
16163          case 2:  // result = lea base(    , cond*2)
16164          case 3:  // result = lea base(cond, cond*2)
16165          case 4:  // result = lea base(    , cond*4)
16166          case 5:  // result = lea base(cond, cond*4)
16167          case 8:  // result = lea base(    , cond*8)
16168          case 9:  // result = lea base(cond, cond*8)
16169            isFastMultiplier = true;
16170            break;
16171          }
16172        }
16173
16174        if (isFastMultiplier) {
16175          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
16176          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
16177                             DAG.getConstant(CC, MVT::i8), Cond);
16178          // Zero extend the condition if needed.
16179          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
16180                             Cond);
16181          // Scale the condition by the difference.
16182          if (Diff != 1)
16183            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
16184                               DAG.getConstant(Diff, Cond.getValueType()));
16185
16186          // Add the base if non-zero.
16187          if (FalseC->getAPIntValue() != 0)
16188            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
16189                               SDValue(FalseC, 0));
16190          if (N->getNumValues() == 2)  // Dead flag value?
16191            return DCI.CombineTo(N, Cond, SDValue());
16192          return Cond;
16193        }
16194      }
16195    }
16196  }
16197
16198  // Handle these cases:
16199  //   (select (x != c), e, c) -> (select (x != c), e, x),
16200  //   (select (x == c), c, e) -> (select (x == c), x, e)
16201  // where c is an integer constant, and the "select" is the combination
16202  // of CMOV and CMP.
16203  //
16204  // The rationale for this change is that a conditional move from a constant
16205  // needs two instructions, whereas a conditional move from a register needs
16206  // only one instruction.
16207  //
16208  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
16209  //  some instruction-combining opportunities. This opt needs to be
16210  //  postponed as late as possible.
16211  //
16212  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
16213    // The DCI.xxxx conditions are provided to postpone the optimization as
16214    // late as possible.
16215
16216    ConstantSDNode *CmpAgainst = 0;
16217    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
16218        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
16219        !isa<ConstantSDNode>(Cond.getOperand(0))) {
16220
16221      if (CC == X86::COND_NE &&
16222          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
16223        CC = X86::GetOppositeBranchCondition(CC);
16224        std::swap(TrueOp, FalseOp);
16225      }
16226
16227      if (CC == X86::COND_E &&
16228          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
16229        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
16230                          DAG.getConstant(CC, MVT::i8), Cond };
16231        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops,
16232                           array_lengthof(Ops));
16233      }
16234    }
16235  }
16236
16237  return SDValue();
16238}
16239
16240/// PerformMulCombine - Optimize a single multiply by a constant into two
16241/// multiplies in order to implement it with two cheaper instructions, e.g.
16242/// LEA + SHL or LEA + LEA.
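     /// For example, x * 45 is rewritten as (x * 9) * 5 (two LEAs), and x * 40
     /// as a multiply by 5 combined with a left shift by 3.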
16243static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
16244                                 TargetLowering::DAGCombinerInfo &DCI) {
16245  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16246    return SDValue();
16247
16248  EVT VT = N->getValueType(0);
16249  if (VT != MVT::i64)
16250    return SDValue();
16251
16252  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
16253  if (!C)
16254    return SDValue();
16255  uint64_t MulAmt = C->getZExtValue();
16256  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
16257    return SDValue();
16258
16259  uint64_t MulAmt1 = 0;
16260  uint64_t MulAmt2 = 0;
16261  if ((MulAmt % 9) == 0) {
16262    MulAmt1 = 9;
16263    MulAmt2 = MulAmt / 9;
16264  } else if ((MulAmt % 5) == 0) {
16265    MulAmt1 = 5;
16266    MulAmt2 = MulAmt / 5;
16267  } else if ((MulAmt % 3) == 0) {
16268    MulAmt1 = 3;
16269    MulAmt2 = MulAmt / 3;
16270  }
16271  if (MulAmt2 &&
16272      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
16273    SDLoc DL(N);
16274
16275    if (isPowerOf2_64(MulAmt2) &&
16276        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
16277      // If the second multiplier is pow2, issue it first. We want the multiply by
16278      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
16279      // is an add.
16280      std::swap(MulAmt1, MulAmt2);
16281
16282    SDValue NewMul;
16283    if (isPowerOf2_64(MulAmt1))
16284      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
16285                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
16286    else
16287      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
16288                           DAG.getConstant(MulAmt1, VT));
16289
16290    if (isPowerOf2_64(MulAmt2))
16291      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
16292                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
16293    else
16294      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
16295                           DAG.getConstant(MulAmt2, VT));
16296
16297    // Do not add new nodes to DAG combiner worklist.
16298    DCI.CombineTo(N, NewMul, false);
16299  }
16300  return SDValue();
16301}
16302
16303static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
16304  SDValue N0 = N->getOperand(0);
16305  SDValue N1 = N->getOperand(1);
16306  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
16307  EVT VT = N0.getValueType();
16308
16309  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
16310  // since the result of setcc_c is all zero's or all ones.
16311  if (VT.isInteger() && !VT.isVector() &&
16312      N1C && N0.getOpcode() == ISD::AND &&
16313      N0.getOperand(1).getOpcode() == ISD::Constant) {
16314    SDValue N00 = N0.getOperand(0);
16315    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
16316        ((N00.getOpcode() == ISD::ANY_EXTEND ||
16317          N00.getOpcode() == ISD::ZERO_EXTEND) &&
16318         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
16319      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
16320      APInt ShAmt = N1C->getAPIntValue();
16321      Mask = Mask.shl(ShAmt);
16322      if (Mask != 0)
16323        return DAG.getNode(ISD::AND, SDLoc(N), VT,
16324                           N00, DAG.getConstant(Mask, VT));
16325    }
16326  }
16327
16328  // Hardware support for vector shifts is sparse, which makes us scalarize the
16329  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
16330  // SHL.
16331  // (shl V, 1) -> add V,V
16332  if (isSplatVector(N1.getNode())) {
16333    assert(N0.getValueType().isVector() && "Invalid vector shift type");
16334    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
16335    // We shift all of the values by one. In many cases we do not have
16336    // hardware support for this operation. This is better expressed as an ADD
16337    // of two values.
16338    if (N1C && (1 == N1C->getZExtValue())) {
16339      return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
16340    }
16341  }
16342
16343  return SDValue();
16344}
16345
16346/// \brief Returns a vector of 0s if the node in input is a vector logical
16347/// shift by a constant amount which is known to be bigger than or equal
16348/// to the vector element size in bits.
16349static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
16350                                      const X86Subtarget *Subtarget) {
16351  EVT VT = N->getValueType(0);
16352
16353  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
16354      (!Subtarget->hasInt256() ||
16355       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
16356    return SDValue();
16357
16358  SDValue Amt = N->getOperand(1);
16359  SDLoc DL(N);
16360  if (isSplatVector(Amt.getNode())) {
16361    SDValue SclrAmt = Amt->getOperand(0);
16362    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
16363      APInt ShiftAmt = C->getAPIntValue();
16364      unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
16365
16366      // SSE2/AVX2 logical shifts always return a vector of 0s
16367      // if the shift amount is bigger than or equal to
16368      // the element size. The constant shift amount will be
16369      // encoded as an 8-bit immediate.
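           // e.g. a v4i32 logical shift by a splat amount of 32 or more is
           // folded to the zero vector here.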
16370      if (ShiftAmt.trunc(8).uge(MaxAmount))
16371        return getZeroVector(VT, Subtarget, DAG, DL);
16372    }
16373  }
16374
16375  return SDValue();
16376}
16377
16378/// PerformShiftCombine - Combine shifts.
16379static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
16380                                   TargetLowering::DAGCombinerInfo &DCI,
16381                                   const X86Subtarget *Subtarget) {
16382  if (N->getOpcode() == ISD::SHL) {
16383    SDValue V = PerformSHLCombine(N, DAG);
16384    if (V.getNode()) return V;
16385  }
16386
16387  if (N->getOpcode() != ISD::SRA) {
16388    // Try to fold this logical shift into a zero vector.
16389    SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
16390    if (V.getNode()) return V;
16391  }
16392
16393  return SDValue();
16394}
16395
16396// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ...))
16397// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
16398// and friends.  Likewise for OR -> CMPNEQSS.
16399static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
16400                            TargetLowering::DAGCombinerInfo &DCI,
16401                            const X86Subtarget *Subtarget) {
16402  unsigned opcode;
16403
16404  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
16405  // we're requiring SSE2 for both.
16406  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
16407    SDValue N0 = N->getOperand(0);
16408    SDValue N1 = N->getOperand(1);
16409    SDValue CMP0 = N0->getOperand(1);
16410    SDValue CMP1 = N1->getOperand(1);
16411    SDLoc DL(N);
16412
16413    // The SETCCs should both refer to the same CMP.
16414    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
16415      return SDValue();
16416
16417    SDValue CMP00 = CMP0->getOperand(0);
16418    SDValue CMP01 = CMP0->getOperand(1);
16419    EVT     VT    = CMP00.getValueType();
16420
16421    if (VT == MVT::f32 || VT == MVT::f64) {
16422      bool ExpectingFlags = false;
16423      // Check for any users that want flags:
16424      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
16425           !ExpectingFlags && UI != UE; ++UI)
16426        switch (UI->getOpcode()) {
16427        default:
16428        case ISD::BR_CC:
16429        case ISD::BRCOND:
16430        case ISD::SELECT:
16431          ExpectingFlags = true;
16432          break;
16433        case ISD::CopyToReg:
16434        case ISD::SIGN_EXTEND:
16435        case ISD::ZERO_EXTEND:
16436        case ISD::ANY_EXTEND:
16437          break;
16438        }
16439
16440      if (!ExpectingFlags) {
16441        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
16442        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
16443
16444        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
16445          X86::CondCode tmp = cc0;
16446          cc0 = cc1;
16447          cc1 = tmp;
16448        }
16449
16450        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
16451            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
16452          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
16453          X86ISD::NodeType NTOperator = is64BitFP ?
16454            X86ISD::FSETCCsd : X86ISD::FSETCCss;
16455          // FIXME: need symbolic constants for these magic numbers.
16456          // See X86ATTInstPrinter.cpp:printSSECC().
16457          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
16458          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
16459                                              DAG.getConstant(x86cc, MVT::i8));
16460          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
16461                                              OnesOrZeroesF);
16462          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
16463                                      DAG.getConstant(1, MVT::i32));
16464          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
16465          return OneBitOfTruth;
16466        }
16467      }
16468    }
16469  }
16470  return SDValue();
16471}
16472
16473/// CanFoldXORWithAllOnes - Test whether the XOR operand is an all-ones vector
16474/// so it can be folded inside ANDNP.
16475static bool CanFoldXORWithAllOnes(const SDNode *N) {
16476  EVT VT = N->getValueType(0);
16477
16478  // Match direct AllOnes for 128 and 256-bit vectors
16479  if (ISD::isBuildVectorAllOnes(N))
16480    return true;
16481
16482  // Look through a bit convert.
16483  if (N->getOpcode() == ISD::BITCAST)
16484    N = N->getOperand(0).getNode();
16485
16486  // Sometimes the operand may come from an insert_subvector building a 256-bit
16487  // all-ones vector.
16488  if (VT.is256BitVector() &&
16489      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
16490    SDValue V1 = N->getOperand(0);
16491    SDValue V2 = N->getOperand(1);
16492
16493    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
16494        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
16495        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
16496        ISD::isBuildVectorAllOnes(V2.getNode()))
16497      return true;
16498  }
16499
16500  return false;
16501}
16502
16503// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
16504// register. In most cases we actually compare or select YMM-sized registers,
16505// and mixing the two types creates horrible code. This method optimizes
16506// some of the transition sequences.
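     // For example, with X and Y of type v8i32,
     //   (zext v8i32 (and (trunc X), (trunc Y)))
     // becomes (and (and X, Y), splat(0xFFFF)), keeping the arithmetic in YMM
     // registers.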
16507static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
16508                                 TargetLowering::DAGCombinerInfo &DCI,
16509                                 const X86Subtarget *Subtarget) {
16510  EVT VT = N->getValueType(0);
16511  if (!VT.is256BitVector())
16512    return SDValue();
16513
16514  assert((N->getOpcode() == ISD::ANY_EXTEND ||
16515          N->getOpcode() == ISD::ZERO_EXTEND ||
16516          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
16517
16518  SDValue Narrow = N->getOperand(0);
16519  EVT NarrowVT = Narrow->getValueType(0);
16520  if (!NarrowVT.is128BitVector())
16521    return SDValue();
16522
16523  if (Narrow->getOpcode() != ISD::XOR &&
16524      Narrow->getOpcode() != ISD::AND &&
16525      Narrow->getOpcode() != ISD::OR)
16526    return SDValue();
16527
16528  SDValue N0  = Narrow->getOperand(0);
16529  SDValue N1  = Narrow->getOperand(1);
16530  SDLoc DL(Narrow);
16531
16532  // The left side has to be a trunc.
16533  if (N0.getOpcode() != ISD::TRUNCATE)
16534    return SDValue();
16535
16536  // The type of the truncated inputs.
16537  EVT WideVT = N0->getOperand(0)->getValueType(0);
16538  if (WideVT != VT)
16539    return SDValue();
16540
16541  // The right side has to be a 'trunc' or a constant vector.
16542  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
16543  bool RHSConst = (isSplatVector(N1.getNode()) &&
16544                   isa<ConstantSDNode>(N1->getOperand(0)));
16545  if (!RHSTrunc && !RHSConst)
16546    return SDValue();
16547
16548  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16549
16550  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
16551    return SDValue();
16552
16553  // Set N0 and N1 to hold the inputs to the new wide operation.
16554  N0 = N0->getOperand(0);
16555  if (RHSConst) {
16556    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
16557                     N1->getOperand(0));
16558    SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
16559    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
16560  } else if (RHSTrunc) {
16561    N1 = N1->getOperand(0);
16562  }
16563
16564  // Generate the wide operation.
16565  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
16566  unsigned Opcode = N->getOpcode();
16567  switch (Opcode) {
16568  case ISD::ANY_EXTEND:
16569    return Op;
16570  case ISD::ZERO_EXTEND: {
16571    unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
16572    APInt Mask = APInt::getAllOnesValue(InBits);
16573    Mask = Mask.zext(VT.getScalarType().getSizeInBits());
16574    return DAG.getNode(ISD::AND, DL, VT,
16575                       Op, DAG.getConstant(Mask, VT));
16576  }
16577  case ISD::SIGN_EXTEND:
16578    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
16579                       Op, DAG.getValueType(NarrowVT));
16580  default:
16581    llvm_unreachable("Unexpected opcode");
16582  }
16583}
16584
16585static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
16586                                 TargetLowering::DAGCombinerInfo &DCI,
16587                                 const X86Subtarget *Subtarget) {
16588  EVT VT = N->getValueType(0);
16589  if (DCI.isBeforeLegalizeOps())
16590    return SDValue();
16591
16592  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
16593  if (R.getNode())
16594    return R;
16595
16596  // Create BLSI, and BLSR instructions
16597  // BLSI is X & (-X)
16598  // BLSR is X & (X-1)
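       // e.g. for X = 0b01101000: X & -X == 0b00001000 (isolate the lowest set
       // bit) and X & (X-1) == 0b01100000 (clear the lowest set bit).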
16599  if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
16600    SDValue N0 = N->getOperand(0);
16601    SDValue N1 = N->getOperand(1);
16602    SDLoc DL(N);
16603
16604    // Check LHS for neg
16605    if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
16606        isZero(N0.getOperand(0)))
16607      return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
16608
16609    // Check RHS for neg
16610    if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
16611        isZero(N1.getOperand(0)))
16612      return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
16613
16614    // Check LHS for X-1
16615    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
16616        isAllOnes(N0.getOperand(1)))
16617      return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
16618
16619    // Check RHS for X-1
16620    if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
16621        isAllOnes(N1.getOperand(1)))
16622      return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
16623
16624    return SDValue();
16625  }
16626
16627  // Want to form ANDNP nodes:
16628  // 1) In the hopes of then easily combining them with OR and AND nodes
16629  //    to form PBLEND/PSIGN.
16630  // 2) To match ANDN packed intrinsics
16631  if (VT != MVT::v2i64 && VT != MVT::v4i64)
16632    return SDValue();
16633
16634  SDValue N0 = N->getOperand(0);
16635  SDValue N1 = N->getOperand(1);
16636  SDLoc DL(N);
16637
16638  // Check LHS for vnot
16639  if (N0.getOpcode() == ISD::XOR &&
16640      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
16641      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
16642    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
16643
16644  // Check RHS for vnot
16645  if (N1.getOpcode() == ISD::XOR &&
16646      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
16647      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
16648    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
16649
16650  return SDValue();
16651}
16652
16653static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
16654                                TargetLowering::DAGCombinerInfo &DCI,
16655                                const X86Subtarget *Subtarget) {
16656  EVT VT = N->getValueType(0);
16657  if (DCI.isBeforeLegalizeOps())
16658    return SDValue();
16659
16660  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
16661  if (R.getNode())
16662    return R;
16663
16664  SDValue N0 = N->getOperand(0);
16665  SDValue N1 = N->getOperand(1);
16666
16667  // look for psign/blend
16668  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
16669    if (!Subtarget->hasSSSE3() ||
16670        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
16671      return SDValue();
16672
16673    // Canonicalize pandn to RHS
16674    if (N0.getOpcode() == X86ISD::ANDNP)
16675      std::swap(N0, N1);
16676    // or (and (m, y), (pandn m, x))
16677    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
16678      SDValue Mask = N1.getOperand(0);
16679      SDValue X    = N1.getOperand(1);
16680      SDValue Y;
16681      if (N0.getOperand(0) == Mask)
16682        Y = N0.getOperand(1);
16683      if (N0.getOperand(1) == Mask)
16684        Y = N0.getOperand(0);
16685
16686      // Check to see if the mask appeared in both the AND and the ANDNP.
16687      if (!Y.getNode())
16688        return SDValue();
16689
16690      // X, Y, and Mask may be bitcasts; look through them to find the
16691      // underlying values.
16692      if (Mask.getOpcode() == ISD::BITCAST)
16693        Mask = Mask.getOperand(0);
16694      if (X.getOpcode() == ISD::BITCAST)
16695        X = X.getOperand(0);
16696      if (Y.getOpcode() == ISD::BITCAST)
16697        Y = Y.getOperand(0);
16698
16699      EVT MaskVT = Mask.getValueType();
16700
16701      // Validate that the Mask operand is a vector sra node.
16702      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
16703      // there is no psrai.b
16704      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
16705      unsigned SraAmt = ~0;
16706      if (Mask.getOpcode() == ISD::SRA) {
16707        SDValue Amt = Mask.getOperand(1);
16708        if (isSplatVector(Amt.getNode())) {
16709          SDValue SclrAmt = Amt->getOperand(0);
16710          if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
16711            SraAmt = C->getZExtValue();
16712        }
16713      } else if (Mask.getOpcode() == X86ISD::VSRAI) {
16714        SDValue SraC = Mask.getOperand(1);
16715        SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
16716      }
16717      if ((SraAmt + 1) != EltBits)
16718        return SDValue();
16719
16720      SDLoc DL(N);
16721
16722      // Now we know we at least have a pblendvb with the mask val.  See if
16723      // we can form a psignb/w/d.
16724      // psign = x.type == y.type == mask.type && y = sub(0, x);
16725      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
16726          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
16727          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
16728        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
16729               "Unsupported VT for PSIGN");
16730        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
16731        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
16732      }
16733      // PBLENDVB only available on SSE 4.1
16734      if (!Subtarget->hasSSE41())
16735        return SDValue();
16736
16737      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
16738
16739      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
16740      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
16741      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
16742      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
16743      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
16744    }
16745  }
16746
16747  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
16748    return SDValue();
16749
16750  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
16751  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
16752    std::swap(N0, N1);
16753  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
16754    return SDValue();
16755  if (!N0.hasOneUse() || !N1.hasOneUse())
16756    return SDValue();
16757
16758  SDValue ShAmt0 = N0.getOperand(1);
16759  if (ShAmt0.getValueType() != MVT::i8)
16760    return SDValue();
16761  SDValue ShAmt1 = N1.getOperand(1);
16762  if (ShAmt1.getValueType() != MVT::i8)
16763    return SDValue();
16764  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
16765    ShAmt0 = ShAmt0.getOperand(0);
16766  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
16767    ShAmt1 = ShAmt1.getOperand(0);
16768
16769  SDLoc DL(N);
16770  unsigned Opc = X86ISD::SHLD;
16771  SDValue Op0 = N0.getOperand(0);
16772  SDValue Op1 = N1.getOperand(0);
16773  if (ShAmt0.getOpcode() == ISD::SUB) {
16774    Opc = X86ISD::SHRD;
16775    std::swap(Op0, Op1);
16776    std::swap(ShAmt0, ShAmt1);
16777  }
16778
16779  unsigned Bits = VT.getSizeInBits();
16780  if (ShAmt1.getOpcode() == ISD::SUB) {
16781    SDValue Sum = ShAmt1.getOperand(0);
16782    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
16783      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
16784      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
16785        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
16786      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
16787        return DAG.getNode(Opc, DL, VT,
16788                           Op0, Op1,
16789                           DAG.getNode(ISD::TRUNCATE, DL,
16790                                       MVT::i8, ShAmt0));
16791    }
16792  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
16793    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
16794    if (ShAmt0C &&
16795        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
16796      return DAG.getNode(Opc, DL, VT,
16797                         N0.getOperand(0), N1.getOperand(0),
16798                         DAG.getNode(ISD::TRUNCATE, DL,
16799                                       MVT::i8, ShAmt0));
16800  }
16801
16802  return SDValue();
16803}
16804
16805// Generate NEG and CMOV for integer abs.
16806static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
16807  EVT VT = N->getValueType(0);
16808
16809  // Since X86 does not have CMOV for 8-bit integer, we don't convert
16810  // 8-bit integer abs to NEG and CMOV.
16811  if (VT.isInteger() && VT.getSizeInBits() == 8)
16812    return SDValue();
16813
16814  SDValue N0 = N->getOperand(0);
16815  SDValue N1 = N->getOperand(1);
16816  SDLoc DL(N);
16817
16818  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
16819  // and change it to SUB and CMOV.
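       // Y is either 0 or -1 (the sign mask of X), so (X + Y) ^ Y yields X when
       // X >= 0 and -X when X < 0, i.e. the absolute value of X.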
16820  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
16821      N0.getOpcode() == ISD::ADD &&
16822      N0.getOperand(1) == N1 &&
16823      N1.getOpcode() == ISD::SRA &&
16824      N1.getOperand(0) == N0.getOperand(0))
16825    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
16826      if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
16827        // Generate SUB & CMOV.
16828        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
16829                                  DAG.getConstant(0, VT), N0.getOperand(0));
16830
16831        SDValue Ops[] = { N0.getOperand(0), Neg,
16832                          DAG.getConstant(X86::COND_GE, MVT::i8),
16833                          SDValue(Neg.getNode(), 1) };
16834        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
16835                           Ops, array_lengthof(Ops));
16836      }
16837  return SDValue();
16838}
16839
16840// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
16841static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
16842                                 TargetLowering::DAGCombinerInfo &DCI,
16843                                 const X86Subtarget *Subtarget) {
16844  EVT VT = N->getValueType(0);
16845  if (DCI.isBeforeLegalizeOps())
16846    return SDValue();
16847
16848  if (Subtarget->hasCMov()) {
16849    SDValue RV = performIntegerAbsCombine(N, DAG);
16850    if (RV.getNode())
16851      return RV;
16852  }
16853
16854  // Try forming BMI if it is available.
16855  if (!Subtarget->hasBMI())
16856    return SDValue();
16857
16858  if (VT != MVT::i32 && VT != MVT::i64)
16859    return SDValue();
16860
16861  assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
16862
16863  // Create BLSMSK instructions by finding X ^ (X-1)
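       // e.g. X = 0b01101000 gives X ^ (X-1) = 0b00001111, a mask of all bits up
       // to and including the lowest set bit of X.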
16864  SDValue N0 = N->getOperand(0);
16865  SDValue N1 = N->getOperand(1);
16866  SDLoc DL(N);
16867
16868  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
16869      isAllOnes(N0.getOperand(1)))
16870    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
16871
16872  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
16873      isAllOnes(N1.getOperand(1)))
16874    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
16875
16876  return SDValue();
16877}
16878
16879/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
16880static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
16881                                  TargetLowering::DAGCombinerInfo &DCI,
16882                                  const X86Subtarget *Subtarget) {
16883  LoadSDNode *Ld = cast<LoadSDNode>(N);
16884  EVT RegVT = Ld->getValueType(0);
16885  EVT MemVT = Ld->getMemoryVT();
16886  SDLoc dl(Ld);
16887  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16888  unsigned RegSz = RegVT.getSizeInBits();
16889
16890  // On Sandy Bridge, unaligned 256-bit loads are inefficient.
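       // Split such a load into two 128-bit loads and reassemble the value
       // with insert_subvector (Insert128BitVector below).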
16891  ISD::LoadExtType Ext = Ld->getExtensionType();
16892  unsigned Alignment = Ld->getAlignment();
16893  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
16894  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
16895      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
16896    unsigned NumElems = RegVT.getVectorNumElements();
16897    if (NumElems < 2)
16898      return SDValue();
16899
16900    SDValue Ptr = Ld->getBasePtr();
16901    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
16902
16903    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16904                                  NumElems/2);
16905    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
16906                                Ld->getPointerInfo(), Ld->isVolatile(),
16907                                Ld->isNonTemporal(), Ld->isInvariant(),
16908                                Alignment);
16909    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16910    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
16911                                Ld->getPointerInfo(), Ld->isVolatile(),
16912                                Ld->isNonTemporal(), Ld->isInvariant(),
16913                                std::min(16U, Alignment));
16914    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
16915                             Load1.getValue(1),
16916                             Load2.getValue(1));
16917
16918    SDValue NewVec = DAG.getUNDEF(RegVT);
16919    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
16920    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
16921    return DCI.CombineTo(N, NewVec, TF, true);
16922  }
16923
16924  // If this is a vector extending load, then attempt to optimize it using a
16925  // shuffle. If SSSE3 is not available we may emit an illegal shuffle, but the
16926  // expansion is still better than scalar code.
16927  // We generate X86ISD::VSEXT for SEXTLOADs if it is available; otherwise we'll
16928  // emit a shuffle and an arithmetic shift.
16929  // TODO: It is possible to support ZExt by zeroing the undef values
16930  // during the shuffle phase or after the shuffle.
16931  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
16932      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
16933    assert(MemVT != RegVT && "Cannot extend to the same type");
16934    assert(MemVT.isVector() && "Must load a vector from memory");
16935
16936    unsigned NumElems = RegVT.getVectorNumElements();
16937    unsigned MemSz = MemVT.getSizeInBits();
16938    assert(RegSz > MemSz && "Register size must be greater than the mem size");
16939
16940    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
16941      return SDValue();
16942
16943    // All sizes must be a power of two.
16944    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
16945      return SDValue();
16946
16947    // Attempt to load the original value using scalar loads.
16948    // Find the largest scalar type that divides the total loaded size.
16949    MVT SclrLoadTy = MVT::i8;
16950    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
16951         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
16952      MVT Tp = (MVT::SimpleValueType)tp;
16953      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16954        SclrLoadTy = Tp;
16955      }
16956    }
16957
16958    // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
16959    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16960        (64 <= MemSz))
16961      SclrLoadTy = MVT::f64;
16962
16963    // Calculate the number of scalar loads that we need to perform
16964    // in order to load our vector from memory.
16965    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16966    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
16967      return SDValue();
16968
16969    unsigned loadRegSize = RegSz;
16970    if (Ext == ISD::SEXTLOAD && RegSz == 256)
16971      loadRegSize /= 2;
16972
16973    // Represent our vector as a sequence of elements which are the
16974    // largest scalar that we can load.
16975    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
16976      loadRegSize/SclrLoadTy.getSizeInBits());
16977
16978    // Represent the data using the same element type that is stored in
16979    // memory. In practice, we "widen" MemVT.
16980    EVT WideVecVT =
16981          EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16982                       loadRegSize/MemVT.getScalarType().getSizeInBits());
16983
16984    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16985      "Invalid vector type");
16986
16987    // We can't shuffle using an illegal type.
16988    if (!TLI.isTypeLegal(WideVecVT))
16989      return SDValue();
16990
16991    SmallVector<SDValue, 8> Chains;
16992    SDValue Ptr = Ld->getBasePtr();
16993    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
16994                                        TLI.getPointerTy());
16995    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16996
16997    for (unsigned i = 0; i < NumLoads; ++i) {
16998      // Perform a single load.
16999      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
17000                                       Ptr, Ld->getPointerInfo(),
17001                                       Ld->isVolatile(), Ld->isNonTemporal(),
17002                                       Ld->isInvariant(), Ld->getAlignment());
17003      Chains.push_back(ScalarLoad.getValue(1));
17004      // Create the first element type using SCALAR_TO_VECTOR in order to avoid
17005      // another round of DAGCombining.
17006      if (i == 0)
17007        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
17008      else
17009        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
17010                          ScalarLoad, DAG.getIntPtrConstant(i));
17011
17012      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
17013    }
17014
17015    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
17016                               Chains.size());
17017
17018    // Bitcast the loaded value to a vector of the original element type, in
17019    // the size of the target vector type.
17020    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
17021    unsigned SizeRatio = RegSz/MemSz;
17022
17023    if (Ext == ISD::SEXTLOAD) {
17024      // If we have SSE4.1 we can directly emit a VSEXT node.
17025      if (Subtarget->hasSSE41()) {
17026        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
17027        return DCI.CombineTo(N, Sext, TF, true);
17028      }
17029
17030      // Otherwise we'll shuffle the small elements in the high bits of the
17031      // larger type and perform an arithmetic shift. If the shift is not legal
17032      // it's better to scalarize.
17033      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
17034        return SDValue();
17035
17036      // Redistribute the loaded elements into the different locations.
17037      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
17038      for (unsigned i = 0; i != NumElems; ++i)
17039        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
17040
17041      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
17042                                           DAG.getUNDEF(WideVecVT),
17043                                           &ShuffleVec[0]);
17044
17045      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
17046
17047      // Build the arithmetic shift.
17048      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
17049                     MemVT.getVectorElementType().getSizeInBits();
17050      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
17051                          DAG.getConstant(Amt, RegVT));
17052
17053      return DCI.CombineTo(N, Shuff, TF, true);
17054    }
17055
17056    // Redistribute the loaded elements into the different locations.
17057    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
17058    for (unsigned i = 0; i != NumElems; ++i)
17059      ShuffleVec[i*SizeRatio] = i;
17060
17061    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
17062                                         DAG.getUNDEF(WideVecVT),
17063                                         &ShuffleVec[0]);
17064
17065    // Bitcast to the requested type.
17066    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
17067    // Replace the original load with the new sequence
17068    // and return the new chain.
17069    return DCI.CombineTo(N, Shuff, TF, true);
17070  }
17071
17072  return SDValue();
17073}
17074
17075/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
17076static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
17077                                   const X86Subtarget *Subtarget) {
17078  StoreSDNode *St = cast<StoreSDNode>(N);
17079  EVT VT = St->getValue().getValueType();
17080  EVT StVT = St->getMemoryVT();
17081  SDLoc dl(St);
17082  SDValue StoredVal = St->getOperand(1);
17083  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17084
17085  // If we are saving a concatenation of two XMM registers, perform two stores.
17086  // On Sandy Bridge, 256-bit memory operations are executed by two
17087  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
17088  // memory operation.
17089  unsigned Alignment = St->getAlignment();
17090  bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
17091  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
17092      StVT == VT && !IsAligned) {
17093    unsigned NumElems = VT.getVectorNumElements();
17094    if (NumElems < 2)
17095      return SDValue();
17096
17097    SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
17098    SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
17099
17100    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
17101    SDValue Ptr0 = St->getBasePtr();
17102    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
17103
17104    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
17105                                St->getPointerInfo(), St->isVolatile(),
17106                                St->isNonTemporal(), Alignment);
17107    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
17108                                St->getPointerInfo(), St->isVolatile(),
17109                                St->isNonTemporal(),
17110                                std::min(16U, Alignment));
17111    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
17112  }
17113
17114  // Optimize trunc store (of multiple scalars) to shuffle and store.
17115  // First, pack all of the elements in one place. Next, store to memory
17116  // in fewer chunks.
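       // e.g. a v8i32-to-v8i16 truncating store becomes a shuffle that packs
       // the eight low halves into the bottom 128 bits, followed by two 64-bit
       // stores.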
17117  if (St->isTruncatingStore() && VT.isVector()) {
17118    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17119    unsigned NumElems = VT.getVectorNumElements();
17120    assert(StVT != VT && "Cannot truncate to the same type");
17121    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
17122    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
17123
17124    // The From/To sizes and the element count must be powers of two.
17125    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
17126    // We are going to use the original vector elt for storing.
17127    // Accumulated smaller vector elements must be a multiple of the store size.
17128    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
17129
17130    unsigned SizeRatio  = FromSz / ToSz;
17131
17132    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
17133
17134    // Create a type on which we perform the shuffle
17135    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
17136            StVT.getScalarType(), NumElems*SizeRatio);
17137
17138    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
17139
17140    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
17141    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
17142    for (unsigned i = 0; i != NumElems; ++i)
17143      ShuffleVec[i] = i * SizeRatio;
17144
17145    // Can't shuffle using an illegal type.
17146    if (!TLI.isTypeLegal(WideVecVT))
17147      return SDValue();
17148
17149    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
17150                                         DAG.getUNDEF(WideVecVT),
17151                                         &ShuffleVec[0]);
17152    // At this point all of the data is stored at the bottom of the
17153    // register. We now need to save it to mem.
17154
17155    // Find the largest store unit
17156    MVT StoreType = MVT::i8;
17157    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
17158         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
17159      MVT Tp = (MVT::SimpleValueType)tp;
17160      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
17161        StoreType = Tp;
17162    }
17163
17164    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
17165    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
17166        (64 <= NumElems * ToSz))
17167      StoreType = MVT::f64;
17168
17169    // Bitcast the original vector into a vector of store-size units
17170    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
17171            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
17172    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
17173    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
17174    SmallVector<SDValue, 8> Chains;
17175    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
17176                                        TLI.getPointerTy());
17177    SDValue Ptr = St->getBasePtr();
17178
17179    // Perform one or more big stores into memory.
17180    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
17181      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
17182                                   StoreType, ShuffWide,
17183                                   DAG.getIntPtrConstant(i));
17184      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
17185                                St->getPointerInfo(), St->isVolatile(),
17186                                St->isNonTemporal(), St->getAlignment());
17187      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
17188      Chains.push_back(Ch);
17189    }
17190
17191    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
17192                               Chains.size());
17193  }
17194
17195  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
17196  // the FP state in cases where an emms may be missing.
17197  // A preferable solution to the general problem is to figure out the right
17198  // places to insert EMMS.  This qualifies as a quick hack.
17199
17200  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
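  // For instance, copying an i64 value from one memory location to another on
  // a 32-bit target with SSE2 can then be done with a single f64 (movsd)
  // load/store pair instead of two 32-bit integer moves.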
17201  if (VT.getSizeInBits() != 64)
17202    return SDValue();
17203
17204  const Function *F = DAG.getMachineFunction().getFunction();
17205  bool NoImplicitFloatOps = F->getAttributes().
17206    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
17207  bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
17208                     && Subtarget->hasSSE2();
17209  if ((VT.isVector() ||
17210       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
17211      isa<LoadSDNode>(St->getValue()) &&
17212      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
17213      St->getChain().hasOneUse() && !St->isVolatile()) {
17214    SDNode* LdVal = St->getValue().getNode();
17215    LoadSDNode *Ld = 0;
17216    int TokenFactorIndex = -1;
17217    SmallVector<SDValue, 8> Ops;
17218    SDNode* ChainVal = St->getChain().getNode();
17219    // Must be a store of a load.  We currently handle two cases:  the load
17220    // is a direct child, and it's under an intervening TokenFactor.  It is
17221    // possible to dig deeper under nested TokenFactors.
17222    if (ChainVal == LdVal)
17223      Ld = cast<LoadSDNode>(St->getChain());
17224    else if (St->getValue().hasOneUse() &&
17225             ChainVal->getOpcode() == ISD::TokenFactor) {
17226      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
17227        if (ChainVal->getOperand(i).getNode() == LdVal) {
17228          TokenFactorIndex = i;
17229          Ld = cast<LoadSDNode>(St->getValue());
17230        } else
17231          Ops.push_back(ChainVal->getOperand(i));
17232      }
17233    }
17234
17235    if (!Ld || !ISD::isNormalLoad(Ld))
17236      return SDValue();
17237
17238    // If this is not the MMX case, i.e. we are just turning i64 load/store
17239    // into f64 load/store, avoid the transformation if there are multiple
17240    // uses of the loaded value.
17241    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
17242      return SDValue();
17243
17244    SDLoc LdDL(Ld);
17245    SDLoc StDL(N);
17246    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
17247    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
17248    // pair instead.
17249    if (Subtarget->is64Bit() || F64IsLegal) {
17250      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
17251      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
17252                                  Ld->getPointerInfo(), Ld->isVolatile(),
17253                                  Ld->isNonTemporal(), Ld->isInvariant(),
17254                                  Ld->getAlignment());
17255      SDValue NewChain = NewLd.getValue(1);
17256      if (TokenFactorIndex != -1) {
17257        Ops.push_back(NewChain);
17258        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
17259                               Ops.size());
17260      }
17261      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
17262                          St->getPointerInfo(),
17263                          St->isVolatile(), St->isNonTemporal(),
17264                          St->getAlignment());
17265    }
17266
17267    // Otherwise, lower to two pairs of 32-bit loads / stores.
17268    SDValue LoAddr = Ld->getBasePtr();
17269    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
17270                                 DAG.getConstant(4, MVT::i32));
17271
17272    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
17273                               Ld->getPointerInfo(),
17274                               Ld->isVolatile(), Ld->isNonTemporal(),
17275                               Ld->isInvariant(), Ld->getAlignment());
17276    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
17277                               Ld->getPointerInfo().getWithOffset(4),
17278                               Ld->isVolatile(), Ld->isNonTemporal(),
17279                               Ld->isInvariant(),
17280                               MinAlign(Ld->getAlignment(), 4));
17281
17282    SDValue NewChain = LoLd.getValue(1);
17283    if (TokenFactorIndex != -1) {
17284      Ops.push_back(LoLd);
17285      Ops.push_back(HiLd);
17286      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
17287                             Ops.size());
17288    }
17289
17290    LoAddr = St->getBasePtr();
17291    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
17292                         DAG.getConstant(4, MVT::i32));
17293
17294    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
17295                                St->getPointerInfo(),
17296                                St->isVolatile(), St->isNonTemporal(),
17297                                St->getAlignment());
17298    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
17299                                St->getPointerInfo().getWithOffset(4),
17300                                St->isVolatile(),
17301                                St->isNonTemporal(),
17302                                MinAlign(St->getAlignment(), 4));
17303    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
17304  }
17305  return SDValue();
17306}
17307
17308/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
17309/// and return the operands for the horizontal operation in LHS and RHS.  A
17310/// horizontal operation performs the binary operation on successive elements
17311/// of its first operand, then on successive elements of its second operand,
17312/// returning the resulting values in a vector.  For example, if
17313///   A = < float a0, float a1, float a2, float a3 >
17314/// and
17315///   B = < float b0, float b1, float b2, float b3 >
17316/// then the result of doing a horizontal operation on A and B is
17317///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
17318/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
17319/// A horizontal-op B, for some already available A and B, and if so then LHS is
17320/// set to A, RHS to B, and the routine returns 'true'.
17321/// Note that the binary operation should have the property that if one of the
17322/// operands is UNDEF then the result is UNDEF.
17323static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
17324  // Look for the following pattern: if
17325  //   A = < float a0, float a1, float a2, float a3 >
17326  //   B = < float b0, float b1, float b2, float b3 >
17327  // and
17328  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
17329  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
17330  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
17331  // which is A horizontal-op B.
17332
17333  // At least one of the operands should be a vector shuffle.
17334  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
17335      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
17336    return false;
17337
17338  EVT VT = LHS.getValueType();
17339
17340  assert((VT.is128BitVector() || VT.is256BitVector()) &&
17341         "Unsupported vector type for horizontal add/sub");
17342
17343  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
17344  // operate independently on 128-bit lanes.
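  // For example, for v8f32 inputs A and B the horizontal form is
  //   <a0 op a1, a2 op a3, b0 op b1, b2 op b3,
  //    a4 op a5, a6 op a7, b4 op b5, b6 op b7>
  // i.e. the usual 128-bit pattern repeated per lane.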
17345  unsigned NumElts = VT.getVectorNumElements();
17346  unsigned NumLanes = VT.getSizeInBits()/128;
17347  unsigned NumLaneElts = NumElts / NumLanes;
17348  assert((NumLaneElts % 2 == 0) &&
17349         "Vector type should have an even number of elements in each lane");
17350  unsigned HalfLaneElts = NumLaneElts/2;
17351
17352  // View LHS in the form
17353  //   LHS = VECTOR_SHUFFLE A, B, LMask
17354  // If LHS is not a shuffle then pretend it is the shuffle
17355  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
17356  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
17357  // type VT.
17358  SDValue A, B;
17359  SmallVector<int, 16> LMask(NumElts);
17360  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
17361    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
17362      A = LHS.getOperand(0);
17363    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
17364      B = LHS.getOperand(1);
17365    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
17366    std::copy(Mask.begin(), Mask.end(), LMask.begin());
17367  } else {
17368    if (LHS.getOpcode() != ISD::UNDEF)
17369      A = LHS;
17370    for (unsigned i = 0; i != NumElts; ++i)
17371      LMask[i] = i;
17372  }
17373
17374  // Likewise, view RHS in the form
17375  //   RHS = VECTOR_SHUFFLE C, D, RMask
17376  SDValue C, D;
17377  SmallVector<int, 16> RMask(NumElts);
17378  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
17379    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
17380      C = RHS.getOperand(0);
17381    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
17382      D = RHS.getOperand(1);
17383    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
17384    std::copy(Mask.begin(), Mask.end(), RMask.begin());
17385  } else {
17386    if (RHS.getOpcode() != ISD::UNDEF)
17387      C = RHS;
17388    for (unsigned i = 0; i != NumElts; ++i)
17389      RMask[i] = i;
17390  }
17391
17392  // Check that the shuffles are both shuffling the same vectors.
17393  if (!(A == C && B == D) && !(A == D && B == C))
17394    return false;
17395
17396  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
17397  if (!A.getNode() && !B.getNode())
17398    return false;
17399
17400  // If A and B occur in reverse order in RHS, then "swap" them (which means
17401  // rewriting the mask).
17402  if (A != C)
17403    CommuteVectorShuffleMask(RMask, NumElts);
17404
17405  // At this point LHS and RHS are equivalent to
17406  //   LHS = VECTOR_SHUFFLE A, B, LMask
17407  //   RHS = VECTOR_SHUFFLE A, B, RMask
17408  // Check that the masks correspond to performing a horizontal operation.
17409  for (unsigned i = 0; i != NumElts; ++i) {
17410    int LIdx = LMask[i], RIdx = RMask[i];
17411
17412    // Ignore any UNDEF components.
17413    if (LIdx < 0 || RIdx < 0 ||
17414        (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
17415        (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
17416      continue;
17417
17418    // Check that successive elements are being operated on.  If not, this is
17419    // not a horizontal operation.
17420    unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
17421    unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
17422    int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
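    // E.g. for v8f32 this expects LMask == <0,2,8,10,4,6,12,14> and
    // RMask == <1,3,9,11,5,7,13,15> (UNDEF entries are skipped, and for
    // commutative ops each pair may appear swapped).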
17423    if (!(LIdx == Index && RIdx == Index + 1) &&
17424        !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
17425      return false;
17426  }
17427
17428  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
17429  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
17430  return true;
17431}
17432
17433/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
17434static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
17435                                  const X86Subtarget *Subtarget) {
17436  EVT VT = N->getValueType(0);
17437  SDValue LHS = N->getOperand(0);
17438  SDValue RHS = N->getOperand(1);
17439
17440  // Try to synthesize horizontal adds from adds of shuffles.
17441  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
17442       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
17443      isHorizontalBinOp(LHS, RHS, true))
17444    return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
17445  return SDValue();
17446}
17447
17448/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
17449static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
17450                                  const X86Subtarget *Subtarget) {
17451  EVT VT = N->getValueType(0);
17452  SDValue LHS = N->getOperand(0);
17453  SDValue RHS = N->getOperand(1);
17454
17455  // Try to synthesize horizontal subs from subs of shuffles.
17456  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
17457       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
17458      isHorizontalBinOp(LHS, RHS, false))
17459    return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
17460  return SDValue();
17461}
17462
17463/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
17464/// X86ISD::FXOR nodes.
17465static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
17466  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
17467  // F[X]OR(0.0, x) -> x
17468  // F[X]OR(x, 0.0) -> x
17469  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
17470    if (C->getValueAPF().isPosZero())
17471      return N->getOperand(1);
17472  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
17473    if (C->getValueAPF().isPosZero())
17474      return N->getOperand(0);
17475  return SDValue();
17476}
17477
17478/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
17479/// X86ISD::FMAX nodes.
17480static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
17481  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
17482
17483  // Only perform this optimization if unsafe FP math is enabled.
17484  if (!DAG.getTarget().Options.UnsafeFPMath)
17485    return SDValue();
17486
17487  // In unsafe-math mode, convert the FMIN and FMAX nodes into FMINC and
17488  // FMAXC, which are commutative operations.
17489  unsigned NewOp = 0;
17490  switch (N->getOpcode()) {
17491    default: llvm_unreachable("unknown opcode");
17492    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
17493    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
17494  }
17495
17496  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
17497                     N->getOperand(0), N->getOperand(1));
17498}
17499
17500/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
17501static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
17502  // FAND(0.0, x) -> 0.0
17503  // FAND(x, 0.0) -> 0.0
17504  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
17505    if (C->getValueAPF().isPosZero())
17506      return N->getOperand(0);
17507  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
17508    if (C->getValueAPF().isPosZero())
17509      return N->getOperand(1);
17510  return SDValue();
17511}
17512
17513static SDValue PerformBTCombine(SDNode *N,
17514                                SelectionDAG &DAG,
17515                                TargetLowering::DAGCombinerInfo &DCI) {
17516  // BT ignores high bits in the bit index operand.
17517  SDValue Op1 = N->getOperand(1);
17518  if (Op1.hasOneUse()) {
17519    unsigned BitWidth = Op1.getValueSizeInBits();
17520    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
17521    APInt KnownZero, KnownOne;
17522    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
17523                                          !DCI.isBeforeLegalizeOps());
17524    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17525    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
17526        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
17527      DCI.CommitTargetLoweringOpt(TLO);
17528  }
17529  return SDValue();
17530}
17531
17532static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
17533  SDValue Op = N->getOperand(0);
17534  if (Op.getOpcode() == ISD::BITCAST)
17535    Op = Op.getOperand(0);
17536  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
17537  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
17538      VT.getVectorElementType().getSizeInBits() ==
17539      OpVT.getVectorElementType().getSizeInBits()) {
17540    return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
17541  }
17542  return SDValue();
17543}
17544
17545static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
17546                                               const X86Subtarget *Subtarget) {
17547  EVT VT = N->getValueType(0);
17548  if (!VT.isVector())
17549    return SDValue();
17550
17551  SDValue N0 = N->getOperand(0);
17552  SDValue N1 = N->getOperand(1);
17553  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
17554  SDLoc dl(N);
17555
17556  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
17557  // AVX2, since there is no sign-extended shift right operation on a
17558  // vector with 64-bit elements.
17559  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
17560  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
17561  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
17562      N0.getOpcode() == ISD::SIGN_EXTEND)) {
17563    SDValue N00 = N0.getOperand(0);
17564
17565    // An extending load has a better lowering on AVX2: it may be replaced
17566    // with an X86ISD::VSEXT node, so leave it alone here.
17567    if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
17568      if (!ISD::isNormalLoad(N00.getNode()))
17569        return SDValue();
17570
17571    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
17572      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
17573                                N00, N1);
17574      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
17575    }
17576  }
17577  return SDValue();
17578}
17579
17580static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
17581                                  TargetLowering::DAGCombinerInfo &DCI,
17582                                  const X86Subtarget *Subtarget) {
17583  if (!DCI.isBeforeLegalizeOps())
17584    return SDValue();
17585
17586  if (!Subtarget->hasFp256())
17587    return SDValue();
17588
17589  EVT VT = N->getValueType(0);
17590  if (VT.isVector() && VT.getSizeInBits() == 256) {
17591    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
17592    if (R.getNode())
17593      return R;
17594  }
17595
17596  return SDValue();
17597}
17598
17599static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
17600                                 const X86Subtarget* Subtarget) {
17601  SDLoc dl(N);
17602  EVT VT = N->getValueType(0);
17603
17604  // Let legalize expand this if it isn't a legal type yet.
17605  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
17606    return SDValue();
17607
17608  EVT ScalarVT = VT.getScalarType();
17609  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
17610      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
17611    return SDValue();
17612
17613  SDValue A = N->getOperand(0);
17614  SDValue B = N->getOperand(1);
17615  SDValue C = N->getOperand(2);
17616
17617  bool NegA = (A.getOpcode() == ISD::FNEG);
17618  bool NegB = (B.getOpcode() == ISD::FNEG);
17619  bool NegC = (C.getOpcode() == ISD::FNEG);
17620
17621  // The multiplication is negated when exactly one of NegA and NegB is set.
17622  bool NegMul = (NegA != NegB);
17623  if (NegA)
17624    A = A.getOperand(0);
17625  if (NegB)
17626    B = B.getOperand(0);
17627  if (NegC)
17628    C = C.getOperand(0);
17629
17630  unsigned Opcode;
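  // Select the FMA variant: (a*b)+c -> FMADD, (a*b)-c -> FMSUB,
  // -(a*b)+c -> FNMADD, -(a*b)-c -> FNMSUB.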
17631  if (!NegMul)
17632    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
17633  else
17634    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
17635
17636  return DAG.getNode(Opcode, dl, VT, A, B, C);
17637}
17638
17639static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
17640                                  TargetLowering::DAGCombinerInfo &DCI,
17641                                  const X86Subtarget *Subtarget) {
17642  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
17643  //           (and (i32 x86isd::setcc_carry), 1)
17644  // This eliminates the zext. This transformation is necessary because
17645  // ISD::SETCC is always legalized to i8.
17646  SDLoc dl(N);
17647  SDValue N0 = N->getOperand(0);
17648  EVT VT = N->getValueType(0);
17649
17650  if (N0.getOpcode() == ISD::AND &&
17651      N0.hasOneUse() &&
17652      N0.getOperand(0).hasOneUse()) {
17653    SDValue N00 = N0.getOperand(0);
17654    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
17655      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
17656      if (!C || C->getZExtValue() != 1)
17657        return SDValue();
17658      return DAG.getNode(ISD::AND, dl, VT,
17659                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
17660                                     N00.getOperand(0), N00.getOperand(1)),
17661                         DAG.getConstant(1, VT));
17662    }
17663  }
17664
17665  if (VT.is256BitVector()) {
17666    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
17667    if (R.getNode())
17668      return R;
17669  }
17670
17671  return SDValue();
17672}
17673
17674// Optimize x == -y --> x+y == 0
17675//          x != -y --> x+y != 0
17676static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
17677  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
17678  SDValue LHS = N->getOperand(0);
17679  SDValue RHS = N->getOperand(1);
17680
17681  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
17682    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
17683      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
17684        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
17685                                   LHS.getValueType(), RHS, LHS.getOperand(1));
17686        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
17687                            addV, DAG.getConstant(0, addV.getValueType()), CC);
17688      }
17689  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
17690    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
17691      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
17692        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
17693                                   RHS.getValueType(), LHS, RHS.getOperand(1));
17694        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
17695                            addV, DAG.getConstant(0, addV.getValueType()), CC);
17696      }
17697  return SDValue();
17698}
17699
17700// Helper function of PerformSETCCCombine. It materializes "setb reg"
17701// as "sbb reg,reg", since it can be extended without zext and produces
17702// an all-ones bit which is more useful than 0/1 in some cases.
17703static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
17704  return DAG.getNode(ISD::AND, DL, MVT::i8,
17705                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
17706                                 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
17707                     DAG.getConstant(1, MVT::i8));
17708}
17709
17710// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
17711static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
17712                                   TargetLowering::DAGCombinerInfo &DCI,
17713                                   const X86Subtarget *Subtarget) {
17714  SDLoc DL(N);
17715  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
17716  SDValue EFLAGS = N->getOperand(1);
17717
17718  if (CC == X86::COND_A) {
17719    // Try to convert COND_A into COND_B in an attempt to facilitate
17720    // materializing "setb reg".
17721    //
17722    // Do not flip "x > c", where "c" is a constant, because the CMP
17723    // instruction cannot take an immediate as its first operand.
17724    //
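    // Conceptually, "a > b" (COND_A on SUB a, b) is rewritten as "b < a"
    // (COND_B on SUB b, a), so the result can be materialized with SETB/SBB.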
17725    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
17726        EFLAGS.getValueType().isInteger() &&
17727        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
17728      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
17729                                   EFLAGS.getNode()->getVTList(),
17730                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
17731      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
17732      return MaterializeSETB(DL, NewEFLAGS, DAG);
17733    }
17734  }
17735
17736  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
17737  // a zext and produces an all-ones bit which is more useful than 0/1 in some
17738  // cases.
17739  if (CC == X86::COND_B)
17740    return MaterializeSETB(DL, EFLAGS, DAG);
17741
17742  SDValue Flags;
17743
17744  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
17745  if (Flags.getNode()) {
17746    SDValue Cond = DAG.getConstant(CC, MVT::i8);
17747    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
17748  }
17749
17750  return SDValue();
17751}
17752
17753// Optimize branch condition evaluation.
17754//
17755static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
17756                                    TargetLowering::DAGCombinerInfo &DCI,
17757                                    const X86Subtarget *Subtarget) {
17758  SDLoc DL(N);
17759  SDValue Chain = N->getOperand(0);
17760  SDValue Dest = N->getOperand(1);
17761  SDValue EFLAGS = N->getOperand(3);
17762  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
17763
17764  SDValue Flags;
17765
17766  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
17767  if (Flags.getNode()) {
17768    SDValue Cond = DAG.getConstant(CC, MVT::i8);
17769    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
17770                       Flags);
17771  }
17772
17773  return SDValue();
17774}
17775
17776static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
17777                                        const X86TargetLowering *XTLI) {
17778  SDValue Op0 = N->getOperand(0);
17779  EVT InVT = Op0->getValueType(0);
17780
17781  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
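  // The hardware has no packed conversion directly from i8 elements, so the
  // input is first sign-extended to i32 elements, which the normal packed
  // int-to-fp lowering can handle.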
17782  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
17783    SDLoc dl(N);
17784    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
17785    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
17786    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
17787  }
17788
17789  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
17790  // a 32-bit target where SSE doesn't support i64->FP operations.
17791  if (Op0.getOpcode() == ISD::LOAD) {
17792    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
17793    EVT VT = Ld->getValueType(0);
17794    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
17795        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
17796        !XTLI->getSubtarget()->is64Bit() &&
17797        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
17798      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
17799                                          Ld->getChain(), Op0, DAG);
17800      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
17801      return FILDChain;
17802    }
17803  }
17804  return SDValue();
17805}
17806
17807// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
17808static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
17809                                 X86TargetLowering::DAGCombinerInfo &DCI) {
17810  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
17811  // the result is either zero or one (depending on the input carry bit).
17812  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
17813  if (X86::isZeroNode(N->getOperand(0)) &&
17814      X86::isZeroNode(N->getOperand(1)) &&
17815      // We don't have a good way to replace an EFLAGS use, so only do this
17816      // when the EFLAGS result is dead right now.
17817      SDValue(N, 1).use_empty()) {
17818    SDLoc DL(N);
17819    EVT VT = N->getValueType(0);
17820    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
17821    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
17822                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
17823                                           DAG.getConstant(X86::COND_B,MVT::i8),
17824                                           N->getOperand(2)),
17825                               DAG.getConstant(1, VT));
17826    return DCI.CombineTo(N, Res1, CarryOut);
17827  }
17828
17829  return SDValue();
17830}
17831
17832// fold (add Y, (sete  X, 0)) -> adc  0, Y
17833//      (add Y, (setne X, 0)) -> sbb -1, Y
17834//      (sub Y, (sete  X, 0)) -> sbb  0, Y
17835//      (sub Y, (setne X, 0)) -> adc -1, Y
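// The trick is that "cmp X, 1" sets the carry flag exactly when X == 0
// (unsigned X < 1), so ADC/SBB with a 0 or -1 immediate folds the setcc
// result directly into the addition or subtraction.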
17836static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
17837  SDLoc DL(N);
17838
17839  // Look through ZExts.
17840  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
17841  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
17842    return SDValue();
17843
17844  SDValue SetCC = Ext.getOperand(0);
17845  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
17846    return SDValue();
17847
17848  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
17849  if (CC != X86::COND_E && CC != X86::COND_NE)
17850    return SDValue();
17851
17852  SDValue Cmp = SetCC.getOperand(1);
17853  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
17854      !X86::isZeroNode(Cmp.getOperand(1)) ||
17855      !Cmp.getOperand(0).getValueType().isInteger())
17856    return SDValue();
17857
17858  SDValue CmpOp0 = Cmp.getOperand(0);
17859  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
17860                               DAG.getConstant(1, CmpOp0.getValueType()));
17861
17862  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
17863  if (CC == X86::COND_NE)
17864    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
17865                       DL, OtherVal.getValueType(), OtherVal,
17866                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
17867  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
17868                     DL, OtherVal.getValueType(), OtherVal,
17869                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
17870}
17871
17872/// PerformAddCombine - Do target-specific dag combines on integer adds.
17873static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
17874                                 const X86Subtarget *Subtarget) {
17875  EVT VT = N->getValueType(0);
17876  SDValue Op0 = N->getOperand(0);
17877  SDValue Op1 = N->getOperand(1);
17878
17879  // Try to synthesize horizontal adds from adds of shuffles.
17880  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
17881       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
17882      isHorizontalBinOp(Op0, Op1, true))
17883    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
17884
17885  return OptimizeConditionalInDecrement(N, DAG);
17886}
17887
17888static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
17889                                 const X86Subtarget *Subtarget) {
17890  SDValue Op0 = N->getOperand(0);
17891  SDValue Op1 = N->getOperand(1);
17892
17893  // X86 can't encode an immediate LHS of a sub. See if we can push the
17894  // negation into a preceding instruction.
17895  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
17896    // If the RHS of the sub is a XOR with one use and a constant, invert the
17897    // immediate. Then add one to the LHS of the sub so we can turn
17898    // X-Y -> X+~Y+1, saving one register.
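    // For example, "5 - (x ^ 3)" becomes "(x ^ ~3) + 6", since
    // ~(x ^ 3) == x ^ ~3 and -v == ~v + 1.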
17899    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
17900        isa<ConstantSDNode>(Op1.getOperand(1))) {
17901      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
17902      EVT VT = Op0.getValueType();
17903      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
17904                                   Op1.getOperand(0),
17905                                   DAG.getConstant(~XorC, VT));
17906      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
17907                         DAG.getConstant(C->getAPIntValue()+1, VT));
17908    }
17909  }
17910
17911  // Try to synthesize horizontal subs from subs of shuffles.
17912  EVT VT = N->getValueType(0);
17913  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
17914       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
17915      isHorizontalBinOp(Op0, Op1, true))
17916    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
17917
17918  return OptimizeConditionalInDecrement(N, DAG);
17919}
17920
17921/// performVZEXTCombine - Do target-specific dag combines on X86ISD::VZEXT nodes.
17922static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
17923                                        TargetLowering::DAGCombinerInfo &DCI,
17924                                        const X86Subtarget *Subtarget) {
17925  // (vzext (bitcast (vzext (x)) -> (vzext x)
17926  SDValue In = N->getOperand(0);
17927  while (In.getOpcode() == ISD::BITCAST)
17928    In = In.getOperand(0);
17929
17930  if (In.getOpcode() != X86ISD::VZEXT)
17931    return SDValue();
17932
17933  return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
17934                     In.getOperand(0));
17935}
17936
17937SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
17938                                             DAGCombinerInfo &DCI) const {
17939  SelectionDAG &DAG = DCI.DAG;
17940  switch (N->getOpcode()) {
17941  default: break;
17942  case ISD::EXTRACT_VECTOR_ELT:
17943    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
17944  case ISD::VSELECT:
17945  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
17946  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
17947  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
17948  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
17949  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
17950  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
17951  case ISD::SHL:
17952  case ISD::SRA:
17953  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
17954  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
17955  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
17956  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
17957  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
17958  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
17959  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
17960  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
17961  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
17962  case X86ISD::FXOR:
17963  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
17964  case X86ISD::FMIN:
17965  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
17966  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
17967  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
17968  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
17969  case ISD::ANY_EXTEND:
17970  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
17971  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
17972  case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
17973  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
17974  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
17975  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
17976  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
17977  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
17978  case X86ISD::SHUFP:       // Handle all target specific shuffles
17979  case X86ISD::PALIGNR:
17980  case X86ISD::UNPCKH:
17981  case X86ISD::UNPCKL:
17982  case X86ISD::MOVHLPS:
17983  case X86ISD::MOVLHPS:
17984  case X86ISD::PSHUFD:
17985  case X86ISD::PSHUFHW:
17986  case X86ISD::PSHUFLW:
17987  case X86ISD::MOVSS:
17988  case X86ISD::MOVSD:
17989  case X86ISD::VPERMILP:
17990  case X86ISD::VPERM2X128:
17991  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
17992  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
17993  }
17994
17995  return SDValue();
17996}
17997
17998/// isTypeDesirableForOp - Return true if the target has native support for
17999/// the specified value type and it is 'desirable' to use the type for the
18000/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
18001/// instruction encodings are longer and some i16 instructions are slow.
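/// (The 16-bit forms require an operand-size (0x66) prefix and can incur
/// partial-register stalls, which is why promoting to i32 is usually better.)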
18002bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
18003  if (!isTypeLegal(VT))
18004    return false;
18005  if (VT != MVT::i16)
18006    return true;
18007
18008  switch (Opc) {
18009  default:
18010    return true;
18011  case ISD::LOAD:
18012  case ISD::SIGN_EXTEND:
18013  case ISD::ZERO_EXTEND:
18014  case ISD::ANY_EXTEND:
18015  case ISD::SHL:
18016  case ISD::SRL:
18017  case ISD::SUB:
18018  case ISD::ADD:
18019  case ISD::MUL:
18020  case ISD::AND:
18021  case ISD::OR:
18022  case ISD::XOR:
18023    return false;
18024  }
18025}
18026
18027/// IsDesirableToPromoteOp - This method queries the target whether it is
18028/// beneficial for dag combiner to promote the specified node. If true, it
18029/// should return the desired promotion type by reference.
18030bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
18031  EVT VT = Op.getValueType();
18032  if (VT != MVT::i16)
18033    return false;
18034
18035  bool Promote = false;
18036  bool Commute = false;
18037  switch (Op.getOpcode()) {
18038  default: break;
18039  case ISD::LOAD: {
18040    LoadSDNode *LD = cast<LoadSDNode>(Op);
18041    // If the non-extending load has a single use and it's not live out, then it
18042    // might be folded.
18043    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
18044                                                     Op.hasOneUse()*/) {
18045      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
18046             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
18047        // The only case where we'd want to promote LOAD (rather than it being
18048        // promoted as an operand) is when its only use is a liveout.
18049        if (UI->getOpcode() != ISD::CopyToReg)
18050          return false;
18051      }
18052    }
18053    Promote = true;
18054    break;
18055  }
18056  case ISD::SIGN_EXTEND:
18057  case ISD::ZERO_EXTEND:
18058  case ISD::ANY_EXTEND:
18059    Promote = true;
18060    break;
18061  case ISD::SHL:
18062  case ISD::SRL: {
18063    SDValue N0 = Op.getOperand(0);
18064    // Look out for (store (shl (load), x)).
18065    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
18066      return false;
18067    Promote = true;
18068    break;
18069  }
18070  case ISD::ADD:
18071  case ISD::MUL:
18072  case ISD::AND:
18073  case ISD::OR:
18074  case ISD::XOR:
18075    Commute = true;
18076    // fallthrough
18077  case ISD::SUB: {
18078    SDValue N0 = Op.getOperand(0);
18079    SDValue N1 = Op.getOperand(1);
18080    if (!Commute && MayFoldLoad(N1))
18081      return false;
18082    // Avoid disabling potential load folding opportunities.
18083    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
18084      return false;
18085    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
18086      return false;
18087    Promote = true;
18088  }
18089  }
18090
18091  PVT = MVT::i32;
18092  return Promote;
18093}
18094
18095//===----------------------------------------------------------------------===//
18096//                           X86 Inline Assembly Support
18097//===----------------------------------------------------------------------===//
18098
18099namespace {
18100  // Helper to match a string against pieces separated by whitespace.
18101  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
18102    s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
18103
18104    for (unsigned i = 0, e = args.size(); i != e; ++i) {
18105      StringRef piece(*args[i]);
18106      if (!s.startswith(piece)) // Check if the piece matches.
18107        return false;
18108
18109      s = s.substr(piece.size());
18110      StringRef::size_type pos = s.find_first_not_of(" \t");
18111      if (pos == 0) // We matched a prefix.
18112        return false;
18113
18114      s = s.substr(pos);
18115    }
18116
18117    return s.empty();
18118  }
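  // Usage sketch: matchAsm(AsmPieces[0], "bswap", "$0") returns true when the
  // piece reads "bswap $0" modulo surrounding whitespace.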
18119  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
18120}
18121
18122bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
18123  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
18124
18125  std::string AsmStr = IA->getAsmString();
18126
18127  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
18128  if (!Ty || Ty->getBitWidth() % 16 != 0)
18129    return false;
18130
18131  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
18132  SmallVector<StringRef, 4> AsmPieces;
18133  SplitString(AsmStr, AsmPieces, ";\n");
18134
18135  switch (AsmPieces.size()) {
18136  default: return false;
18137  case 1:
18138    // FIXME: this should verify that we are targeting a 486 or better.  If not,
18139    // we will turn this bswap into something that will be lowered to logical
18140    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
18141    // lower so don't worry about this.
18142    // bswap $0
18143    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
18144        matchAsm(AsmPieces[0], "bswapl", "$0") ||
18145        matchAsm(AsmPieces[0], "bswapq", "$0") ||
18146        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
18147        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
18148        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
18149      // No need to check constraints, nothing other than the equivalent of
18150      // "=r,0" would be valid here.
18151      return IntrinsicLowering::LowerToByteSwap(CI);
18152    }
18153
18154    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
18155    if (CI->getType()->isIntegerTy(16) &&
18156        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
18157        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
18158         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
18159      AsmPieces.clear();
18160      const std::string &ConstraintsStr = IA->getConstraintString();
18161      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
18162      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
18163      if (AsmPieces.size() == 4 &&
18164          AsmPieces[0] == "~{cc}" &&
18165          AsmPieces[1] == "~{dirflag}" &&
18166          AsmPieces[2] == "~{flags}" &&
18167          AsmPieces[3] == "~{fpsr}")
18168        return IntrinsicLowering::LowerToByteSwap(CI);
18169    }
18170    break;
18171  case 3:
18172    if (CI->getType()->isIntegerTy(32) &&
18173        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
18174        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
18175        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
18176        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
18177      AsmPieces.clear();
18178      const std::string &ConstraintsStr = IA->getConstraintString();
18179      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
18180      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
18181      if (AsmPieces.size() == 4 &&
18182          AsmPieces[0] == "~{cc}" &&
18183          AsmPieces[1] == "~{dirflag}" &&
18184          AsmPieces[2] == "~{flags}" &&
18185          AsmPieces[3] == "~{fpsr}")
18186        return IntrinsicLowering::LowerToByteSwap(CI);
18187    }
18188
18189    if (CI->getType()->isIntegerTy(64)) {
18190      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
18191      if (Constraints.size() >= 2 &&
18192          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
18193          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
18194        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
18195        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
18196            matchAsm(AsmPieces[1], "bswap", "%edx") &&
18197            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
18198          return IntrinsicLowering::LowerToByteSwap(CI);
18199      }
18200    }
18201    break;
18202  }
18203  return false;
18204}
18205
18206/// getConstraintType - Given a constraint letter, return the type of
18207/// constraint it is for this target.
18208X86TargetLowering::ConstraintType
18209X86TargetLowering::getConstraintType(const std::string &Constraint) const {
18210  if (Constraint.size() == 1) {
18211    switch (Constraint[0]) {
18212    case 'R':
18213    case 'q':
18214    case 'Q':
18215    case 'f':
18216    case 't':
18217    case 'u':
18218    case 'y':
18219    case 'x':
18220    case 'Y':
18221    case 'l':
18222      return C_RegisterClass;
18223    case 'a':
18224    case 'b':
18225    case 'c':
18226    case 'd':
18227    case 'S':
18228    case 'D':
18229    case 'A':
18230      return C_Register;
18231    case 'I':
18232    case 'J':
18233    case 'K':
18234    case 'L':
18235    case 'M':
18236    case 'N':
18237    case 'G':
18238    case 'C':
18239    case 'e':
18240    case 'Z':
18241      return C_Other;
18242    default:
18243      break;
18244    }
18245  }
18246  return TargetLowering::getConstraintType(Constraint);
18247}
18248
18249/// Examine constraint type and operand type and determine a weight value.
18250/// This object must already have been set up with the operand type
18251/// and the current alternative constraint selected.
18252TargetLowering::ConstraintWeight
18253  X86TargetLowering::getSingleConstraintMatchWeight(
18254    AsmOperandInfo &info, const char *constraint) const {
18255  ConstraintWeight weight = CW_Invalid;
18256  Value *CallOperandVal = info.CallOperandVal;
18257  // If we don't have a value, we can't do a match,
18258  // but allow it at the lowest weight.
18259  if (CallOperandVal == NULL)
18260    return CW_Default;
18261  Type *type = CallOperandVal->getType();
18262  // Look at the constraint type.
18263  switch (*constraint) {
18264  default:
18265    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
18266  case 'R':
18267  case 'q':
18268  case 'Q':
18269  case 'a':
18270  case 'b':
18271  case 'c':
18272  case 'd':
18273  case 'S':
18274  case 'D':
18275  case 'A':
18276    if (CallOperandVal->getType()->isIntegerTy())
18277      weight = CW_SpecificReg;
18278    break;
18279  case 'f':
18280  case 't':
18281  case 'u':
18282    if (type->isFloatingPointTy())
18283      weight = CW_SpecificReg;
18284    break;
18285  case 'y':
18286    if (type->isX86_MMXTy() && Subtarget->hasMMX())
18287      weight = CW_SpecificReg;
18288    break;
18289  case 'x':
18290  case 'Y':
18291    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
18292        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
18293      weight = CW_Register;
18294    break;
18295  case 'I':
18296    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
18297      if (C->getZExtValue() <= 31)
18298        weight = CW_Constant;
18299    }
18300    break;
18301  case 'J':
18302    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
18303      if (C->getZExtValue() <= 63)
18304        weight = CW_Constant;
18305    }
18306    break;
18307  case 'K':
18308    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
18309      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
18310        weight = CW_Constant;
18311    }
18312    break;
18313  case 'L':
18314    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
18315      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
18316        weight = CW_Constant;
18317    }
18318    break;
18319  case 'M':
18320    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
18321      if (C->getZExtValue() <= 3)
18322        weight = CW_Constant;
18323    }
18324    break;
18325  case 'N':
18326    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
18327      if (C->getZExtValue() <= 0xff)
18328        weight = CW_Constant;
18329    }
18330    break;
18331  case 'G':
18332  case 'C':
18333    if (dyn_cast<ConstantFP>(CallOperandVal)) {
18334      weight = CW_Constant;
18335    }
18336    break;
18337  case 'e':
18338    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
18339      if ((C->getSExtValue() >= -0x80000000LL) &&
18340          (C->getSExtValue() <= 0x7fffffffLL))
18341        weight = CW_Constant;
18342    }
18343    break;
18344  case 'Z':
18345    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
18346      if (C->getZExtValue() <= 0xffffffff)
18347        weight = CW_Constant;
18348    }
18349    break;
18350  }
18351  return weight;
18352}
18353
18354/// LowerXConstraint - try to replace an X constraint, which matches anything,
18355/// with another that has more specific requirements based on the type of the
18356/// corresponding operand.
18357const char *X86TargetLowering::
18358LowerXConstraint(EVT ConstraintVT) const {
18359  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
18360  // 'f' like normal targets.
18361  if (ConstraintVT.isFloatingPoint()) {
18362    if (Subtarget->hasSSE2())
18363      return "Y";
18364    if (Subtarget->hasSSE1())
18365      return "x";
18366  }
18367
18368  return TargetLowering::LowerXConstraint(ConstraintVT);
18369}
18370
18371/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
18372/// vector.  If it is invalid, don't add anything to Ops.
18373void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
18374                                                     std::string &Constraint,
18375                                                     std::vector<SDValue>&Ops,
18376                                                     SelectionDAG &DAG) const {
18377  SDValue Result(0, 0);
18378
18379  // Only support length 1 constraints for now.
18380  if (Constraint.length() > 1) return;
18381
18382  char ConstraintLetter = Constraint[0];
18383  switch (ConstraintLetter) {
18384  default: break;
18385  case 'I':
18386    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
18387      if (C->getZExtValue() <= 31) {
18388        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
18389        break;
18390      }
18391    }
18392    return;
18393  case 'J':
18394    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
18395      if (C->getZExtValue() <= 63) {
18396        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
18397        break;
18398      }
18399    }
18400    return;
18401  case 'K':
18402    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
18403      if (isInt<8>(C->getSExtValue())) {
18404        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
18405        break;
18406      }
18407    }
18408    return;
18409  case 'N':
18410    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
18411      if (C->getZExtValue() <= 255) {
18412        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
18413        break;
18414      }
18415    }
18416    return;
18417  case 'e': {
18418    // 32-bit signed value
18419    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
18420      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
18421                                           C->getSExtValue())) {
18422        // Widen to 64 bits here to get it sign extended.
18423        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
18424        break;
18425      }
18426    // FIXME gcc accepts some relocatable values here too, but only in certain
18427    // memory models; it's complicated.
18428    }
18429    return;
18430  }
18431  case 'Z': {
18432    // 32-bit unsigned value
18433    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
18434      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
18435                                           C->getZExtValue())) {
18436        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
18437        break;
18438      }
18439    }
18440    // FIXME gcc accepts some relocatable values here too, but only in certain
18441    // memory models; it's complicated.
18442    return;
18443  }
18444  case 'i': {
18445    // Literal immediates are always ok.
18446    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
18447      // Widen to 64 bits here to get it sign extended.
18448      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
18449      break;
18450    }
18451
18452    // In any sort of PIC mode addresses need to be computed at runtime by
18453    // adding in a register or some sort of table lookup.  These can't
18454    // be used as immediates.
18455    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
18456      return;
18457
18458    // If we are in non-pic codegen mode, we allow the address of a global (with
18459    // an optional displacement) to be used with 'i'.
18460    GlobalAddressSDNode *GA = 0;
18461    int64_t Offset = 0;
18462
18463    // Match either (GA), (GA+C), (GA+C1+C2), etc.
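    // For example, an operand like "&g + 8" arrives as (add (GlobalAddress g),
    // 8) and is folded here into a single TargetGlobalAddress with Offset 8.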
18464    while (1) {
18465      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
18466        Offset += GA->getOffset();
18467        break;
18468      } else if (Op.getOpcode() == ISD::ADD) {
18469        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
18470          Offset += C->getZExtValue();
18471          Op = Op.getOperand(0);
18472          continue;
18473        }
18474      } else if (Op.getOpcode() == ISD::SUB) {
18475        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
18476          Offset += -C->getZExtValue();
18477          Op = Op.getOperand(0);
18478          continue;
18479        }
18480      }
18481
18482      // Otherwise, this isn't something we can handle, reject it.
18483      return;
18484    }
18485
18486    const GlobalValue *GV = GA->getGlobal();
18487    // If we require an extra load to get this address, as in PIC mode, we
18488    // can't accept it.
18489    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
18490                                                        getTargetMachine())))
18491      return;
18492
18493    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
18494                                        GA->getValueType(0), Offset);
18495    break;
18496  }
18497  }
18498
18499  if (Result.getNode()) {
18500    Ops.push_back(Result);
18501    return;
18502  }
18503  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
18504}
18505
18506std::pair<unsigned, const TargetRegisterClass*>
18507X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
18508                                                MVT VT) const {
18509  // First, see if this is a constraint that directly corresponds to an LLVM
18510  // register class.
18511  if (Constraint.size() == 1) {
18512    // GCC Constraint Letters
18513    switch (Constraint[0]) {
18514    default: break;
18515      // TODO: Slight differences here in allocation order and leaving
18516      // RIP in the class. Do they matter any more here than they do
18517      // in the normal allocation?
18518    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
18519      if (Subtarget->is64Bit()) {
18520        if (VT == MVT::i32 || VT == MVT::f32)
18521          return std::make_pair(0U, &X86::GR32RegClass);
18522        if (VT == MVT::i16)
18523          return std::make_pair(0U, &X86::GR16RegClass);
18524        if (VT == MVT::i8 || VT == MVT::i1)
18525          return std::make_pair(0U, &X86::GR8RegClass);
18526        if (VT == MVT::i64 || VT == MVT::f64)
18527          return std::make_pair(0U, &X86::GR64RegClass);
18528        break;
18529      }
18530      // In 32-bit mode, fall through to the 'Q' handling below.
18531    case 'Q':   // Q_REGS
18532      if (VT == MVT::i32 || VT == MVT::f32)
18533        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
18534      if (VT == MVT::i16)
18535        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
18536      if (VT == MVT::i8 || VT == MVT::i1)
18537        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
18538      if (VT == MVT::i64)
18539        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
18540      break;
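    // Illustrative sketch (operand names assumed): in 32-bit mode both 'q' and
    // 'Q' restrict the operand to EAX/EBX/ECX/EDX so that its low byte is
    // addressable, e.g.
    //   asm ("movb %b0, (%1)" : : "q"(value), "r"(ptr));
    // whereas plain 'r' below may also pick ESI/EDI/EBP/ESP.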
18541    case 'r':   // GENERAL_REGS
18542    case 'l':   // INDEX_REGS
18543      if (VT == MVT::i8 || VT == MVT::i1)
18544        return std::make_pair(0U, &X86::GR8RegClass);
18545      if (VT == MVT::i16)
18546        return std::make_pair(0U, &X86::GR16RegClass);
18547      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
18548        return std::make_pair(0U, &X86::GR32RegClass);
18549      return std::make_pair(0U, &X86::GR64RegClass);
18550    case 'R':   // LEGACY_REGS
18551      if (VT == MVT::i8 || VT == MVT::i1)
18552        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
18553      if (VT == MVT::i16)
18554        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
18555      if (VT == MVT::i32 || !Subtarget->is64Bit())
18556        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
18557      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
18558    case 'f':  // FP Stack registers.
18559      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
18560      // value to the correct fpstack register class.
18561      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
18562        return std::make_pair(0U, &X86::RFP32RegClass);
18563      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
18564        return std::make_pair(0U, &X86::RFP64RegClass);
18565      return std::make_pair(0U, &X86::RFP80RegClass);
18566    case 'y':   // MMX_REGS if MMX allowed.
18567      if (!Subtarget->hasMMX()) break;
18568      return std::make_pair(0U, &X86::VR64RegClass);
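    // Illustrative sketch (assumed example): with MMX available, a 'y' operand
    // such as
    //   __m64 a, b;
    //   asm ("paddb %1, %0" : "+y"(a) : "y"(b));
    // is assigned one of MM0..MM7 via VR64.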
18569    case 'Y':   // SSE_REGS if SSE2 allowed
18570      if (!Subtarget->hasSSE2()) break;
18571      // FALL THROUGH.
18572    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
18573      if (!Subtarget->hasSSE1()) break;
18574
18575      switch (VT.SimpleTy) {
18576      default: break;
18577      // Scalar SSE types.
18578      case MVT::f32:
18579      case MVT::i32:
18580        return std::make_pair(0U, &X86::FR32RegClass);
18581      case MVT::f64:
18582      case MVT::i64:
18583        return std::make_pair(0U, &X86::FR64RegClass);
18584      // Vector types.
18585      case MVT::v16i8:
18586      case MVT::v8i16:
18587      case MVT::v4i32:
18588      case MVT::v2i64:
18589      case MVT::v4f32:
18590      case MVT::v2f64:
18591        return std::make_pair(0U, &X86::VR128RegClass);
18592      // AVX types.
18593      case MVT::v32i8:
18594      case MVT::v16i16:
18595      case MVT::v8i32:
18596      case MVT::v4i64:
18597      case MVT::v8f32:
18598      case MVT::v4f64:
18599        return std::make_pair(0U, &X86::VR256RegClass);
18600      case MVT::v8f64:
18601      case MVT::v16f32:
18602      case MVT::v16i32:
18603      case MVT::v8i64:
18604        return std::make_pair(0U, &X86::VR512RegClass);
18605      }
18606      break;
18607    }
18608  }
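  // Illustrative sketch (assumed IR, not from a test): with AVX enabled, the
  // 'x' constraint on a 256-bit vector type selects VR256 above, e.g.
  //   %r = call <8 x float> asm "vaddps $1, $1, $0", "=x,x"(<8 x float> %v)
  // ends up in a YMM register, while the same constraint on <4 x float> gets
  // an XMM register from VR128.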
18609
18610  // Use the default implementation in TargetLowering to convert the register
18611  // constraint into a member of a register class.
18612  std::pair<unsigned, const TargetRegisterClass*> Res;
18613  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
18614
18615  // Not found as a standard register?
18616  if (Res.second == 0) {
18617    // Map "st(0)" .. "st(7)" to the corresponding ST0 .. ST7 register.
18618    if (Constraint.size() == 7 && Constraint[0] == '{' &&
18619        tolower(Constraint[1]) == 's' &&
18620        tolower(Constraint[2]) == 't' &&
18621        Constraint[3] == '(' &&
18622        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
18623        Constraint[5] == ')' &&
18624        Constraint[6] == '}') {
18625
18626      Res.first = X86::ST0+Constraint[4]-'0';
18627      Res.second = &X86::RFP80RegClass;
18628      return Res;
18629    }
18630
18631    // GCC allows "st(0)" to be called just plain "st".
18632    if (StringRef("{st}").equals_lower(Constraint)) {
18633      Res.first = X86::ST0;
18634      Res.second = &X86::RFP80RegClass;
18635      return Res;
18636    }
18637
18638    // flags -> EFLAGS
18639    if (StringRef("{flags}").equals_lower(Constraint)) {
18640      Res.first = X86::EFLAGS;
18641      Res.second = &X86::CCRRegClass;
18642      return Res;
18643    }
18644
18645    // 'A' means EAX + EDX.
18646    if (Constraint == "A") {
18647      Res.first = X86::EAX;
18648      Res.second = &X86::GR32_ADRegClass;
18649      return Res;
18650    }
18651    return Res;
18652  }
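  // Illustrative sketch (assumed example): a classic use of the 'A' constraint
  // handled above is
  //   unsigned long long tsc;
  //   asm volatile ("rdtsc" : "=A"(tsc));
  // which on 32-bit x86 binds the 64-bit result to the EDX:EAX pair, while the
  // "{st}" and "{flags}" spellings resolve to ST0/RFP80 and EFLAGS/CCR.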
18653
18654  // Otherwise, check to see if this is a register class of the wrong value
18655  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it
18656  // to turn into {ax},{dx}.
18657  if (Res.second->hasType(VT))
18658    return Res;   // Correct type already, nothing to do.
18659
18660  // All of the single-register GCC register classes map their values onto
18661  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
18662  // really want an 8-bit, 32-bit, or 64-bit register, map to the appropriate
18663  // register class and return the appropriate register.
18664  if (Res.second == &X86::GR16RegClass) {
18665    if (VT == MVT::i8 || VT == MVT::i1) {
18666      unsigned DestReg = 0;
18667      switch (Res.first) {
18668      default: break;
18669      case X86::AX: DestReg = X86::AL; break;
18670      case X86::DX: DestReg = X86::DL; break;
18671      case X86::CX: DestReg = X86::CL; break;
18672      case X86::BX: DestReg = X86::BL; break;
18673      }
18674      if (DestReg) {
18675        Res.first = DestReg;
18676        Res.second = &X86::GR8RegClass;
18677      }
18678    } else if (VT == MVT::i32 || VT == MVT::f32) {
18679      unsigned DestReg = 0;
18680      switch (Res.first) {
18681      default: break;
18682      case X86::AX: DestReg = X86::EAX; break;
18683      case X86::DX: DestReg = X86::EDX; break;
18684      case X86::CX: DestReg = X86::ECX; break;
18685      case X86::BX: DestReg = X86::EBX; break;
18686      case X86::SI: DestReg = X86::ESI; break;
18687      case X86::DI: DestReg = X86::EDI; break;
18688      case X86::BP: DestReg = X86::EBP; break;
18689      case X86::SP: DestReg = X86::ESP; break;
18690      }
18691      if (DestReg) {
18692        Res.first = DestReg;
18693        Res.second = &X86::GR32RegClass;
18694      }
18695    } else if (VT == MVT::i64 || VT == MVT::f64) {
18696      unsigned DestReg = 0;
18697      switch (Res.first) {
18698      default: break;
18699      case X86::AX: DestReg = X86::RAX; break;
18700      case X86::DX: DestReg = X86::RDX; break;
18701      case X86::CX: DestReg = X86::RCX; break;
18702      case X86::BX: DestReg = X86::RBX; break;
18703      case X86::SI: DestReg = X86::RSI; break;
18704      case X86::DI: DestReg = X86::RDI; break;
18705      case X86::BP: DestReg = X86::RBP; break;
18706      case X86::SP: DestReg = X86::RSP; break;
18707      }
18708      if (DestReg) {
18709        Res.first = DestReg;
18710        Res.second = &X86::GR64RegClass;
18711      }
18712    }
18713  } else if (Res.second == &X86::FR32RegClass ||
18714             Res.second == &X86::FR64RegClass ||
18715             Res.second == &X86::VR128RegClass ||
18716             Res.second == &X86::VR256RegClass ||
18717             Res.second == &X86::FR32XRegClass ||
18718             Res.second == &X86::FR64XRegClass ||
18719             Res.second == &X86::VR128XRegClass ||
18720             Res.second == &X86::VR256XRegClass ||
18721             Res.second == &X86::VR512RegClass) {
18722    // Handle references to XMM physical registers that got mapped into the
18723    // wrong class.  This can happen with constraints like {xmm0} where the
18724    // target independent register mapper will just pick the first match it can
18725    // find, ignoring the required type.
18726
18727    if (VT == MVT::f32 || VT == MVT::i32)
18728      Res.second = &X86::FR32RegClass;
18729    else if (VT == MVT::f64 || VT == MVT::i64)
18730      Res.second = &X86::FR64RegClass;
18731    else if (X86::VR128RegClass.hasType(VT))
18732      Res.second = &X86::VR128RegClass;
18733    else if (X86::VR256RegClass.hasType(VT))
18734      Res.second = &X86::VR256RegClass;
18735    else if (X86::VR512RegClass.hasType(VT))
18736      Res.second = &X86::VR512RegClass;
18737  }
18738
18739  return Res;
18740}
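// Illustrative sketch (assumed IR, not taken from a test): the remapping at
// the end of getRegForInlineAsmConstraint is what lets a physical-register
// constraint such as
//   %v = call i32 asm "movl $$1, $0", "={ax}"()
// work: the generic lookup may return AX in a 16-bit class, and the code above
// widens that to EAX/GR32 to match the i32 value type.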
18741