X86ISelLowering.cpp revision c7e77f91fecd662b198939a9a8ee0a0cc3828fc4
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "x86-isel"
16#include "X86ISelLowering.h"
17#include "Utils/X86ShuffleDecode.h"
18#include "X86.h"
19#include "X86CallingConv.h"
20#include "X86InstrBuilder.h"
21#include "X86TargetMachine.h"
22#include "X86TargetObjectFile.h"
23#include "llvm/ADT/SmallSet.h"
24#include "llvm/ADT/Statistic.h"
25#include "llvm/ADT/StringExtras.h"
26#include "llvm/ADT/VariadicFunction.h"
27#include "llvm/CodeGen/IntrinsicLowering.h"
28#include "llvm/CodeGen/MachineFrameInfo.h"
29#include "llvm/CodeGen/MachineFunction.h"
30#include "llvm/CodeGen/MachineInstrBuilder.h"
31#include "llvm/CodeGen/MachineJumpTableInfo.h"
32#include "llvm/CodeGen/MachineModuleInfo.h"
33#include "llvm/CodeGen/MachineRegisterInfo.h"
34#include "llvm/IR/CallingConv.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DerivedTypes.h"
37#include "llvm/IR/Function.h"
38#include "llvm/IR/GlobalAlias.h"
39#include "llvm/IR/GlobalVariable.h"
40#include "llvm/IR/Instructions.h"
41#include "llvm/IR/Intrinsics.h"
42#include "llvm/IR/LLVMContext.h"
43#include "llvm/MC/MCAsmInfo.h"
44#include "llvm/MC/MCContext.h"
45#include "llvm/MC/MCExpr.h"
46#include "llvm/MC/MCSymbol.h"
47#include "llvm/Support/CallSite.h"
48#include "llvm/Support/Debug.h"
49#include "llvm/Support/ErrorHandling.h"
50#include "llvm/Support/MathExtras.h"
51#include "llvm/Target/TargetOptions.h"
52#include <bitset>
53#include <cctype>
54using namespace llvm;
55
56STATISTIC(NumTailCalls, "Number of tail calls");
57
58// Forward declarations.
59static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
60                       SDValue V2);
61
62static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
63                                SelectionDAG &DAG, SDLoc dl,
64                                unsigned vectorWidth) {
65  assert((vectorWidth == 128 || vectorWidth == 256) &&
66         "Unsupported vector width");
67  EVT VT = Vec.getValueType();
68  EVT ElVT = VT.getVectorElementType();
69  unsigned Factor = VT.getSizeInBits()/vectorWidth;
70  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
71                                  VT.getVectorNumElements()/Factor);
72
73  // Extract from UNDEF is UNDEF.
74  if (Vec.getOpcode() == ISD::UNDEF)
75    return DAG.getUNDEF(ResultVT);
76
77  // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
78  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
79
80  // This is the index of the first element of the vectorWidth-bit chunk
81  // we want.
82  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
83                               * ElemsPerChunk);
84
85  // If the input is a buildvector just emit a smaller one.
86  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
87    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
88                       Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
89
90  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
91  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
92                               VecIdx);
93
94  return Result;
95
96}
97/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
98/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
99/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
100/// instructions or a simple subregister reference. Idx is an index in the
101/// 128 bits we want.  It need not be aligned to a 128-bit bounday.  That makes
102/// lowering EXTRACT_VECTOR_ELT operations easier.
103static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
104                                   SelectionDAG &DAG, SDLoc dl) {
105  assert((Vec.getValueType().is256BitVector() ||
106          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
107  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
108}
109
110/// Generate a DAG to grab 256-bits from a 512-bit vector.
111static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
112                                   SelectionDAG &DAG, SDLoc dl) {
113  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
114  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
115}
116
117static SDValue InsertSubVector(SDValue Result, SDValue Vec,
118                               unsigned IdxVal, SelectionDAG &DAG,
119                               SDLoc dl, unsigned vectorWidth) {
120  assert((vectorWidth == 128 || vectorWidth == 256) &&
121         "Unsupported vector width");
122  // Inserting UNDEF is Result
123  if (Vec.getOpcode() == ISD::UNDEF)
124    return Result;
125  EVT VT = Vec.getValueType();
126  EVT ElVT = VT.getVectorElementType();
127  EVT ResultVT = Result.getValueType();
128
129  // Insert the relevant vectorWidth bits.
130  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
131
132  // This is the index of the first element of the vectorWidth-bit chunk
133  // we want.
134  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
135                               * ElemsPerChunk);
136
137  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
138  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
139                     VecIdx);
140}
141/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
142/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
143/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
144/// simple superregister reference.  Idx is an index in the 128 bits
145/// we want.  It need not be aligned to a 128-bit bounday.  That makes
146/// lowering INSERT_VECTOR_ELT operations easier.
147static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
148                                  unsigned IdxVal, SelectionDAG &DAG,
149                                  SDLoc dl) {
150  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
151  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
152}
153
154static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
155                                  unsigned IdxVal, SelectionDAG &DAG,
156                                  SDLoc dl) {
157  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
158  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
159}
160
161/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
162/// instructions. This is used because creating CONCAT_VECTOR nodes of
163/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
164/// large BUILD_VECTORS.
165static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
166                                   unsigned NumElems, SelectionDAG &DAG,
167                                   SDLoc dl) {
168  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
169  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
170}
171
172static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
173                                   unsigned NumElems, SelectionDAG &DAG,
174                                   SDLoc dl) {
175  SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
176  return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
177}
178
179static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
180  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
181  bool is64Bit = Subtarget->is64Bit();
182
183  if (Subtarget->isTargetEnvMacho()) {
184    if (is64Bit)
185      return new X86_64MachoTargetObjectFile();
186    return new TargetLoweringObjectFileMachO();
187  }
188
189  if (Subtarget->isTargetLinux())
190    return new X86LinuxTargetObjectFile();
191  if (Subtarget->isTargetELF())
192    return new TargetLoweringObjectFileELF();
193  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
194    return new TargetLoweringObjectFileCOFF();
195  llvm_unreachable("unknown subtarget type");
196}
197
198X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
199  : TargetLowering(TM, createTLOF(TM)) {
200  Subtarget = &TM.getSubtarget<X86Subtarget>();
201  X86ScalarSSEf64 = Subtarget->hasSSE2();
202  X86ScalarSSEf32 = Subtarget->hasSSE1();
203  TD = getDataLayout();
204
205  resetOperationActions();
206}
207
208void X86TargetLowering::resetOperationActions() {
209  const TargetMachine &TM = getTargetMachine();
210  static bool FirstTimeThrough = true;
211
212  // If none of the target options have changed, then we don't need to reset the
213  // operation actions.
214  if (!FirstTimeThrough && TO == TM.Options) return;
215
216  if (!FirstTimeThrough) {
217    // Reinitialize the actions.
218    initActions();
219    FirstTimeThrough = false;
220  }
221
222  TO = TM.Options;
223
224  // Set up the TargetLowering object.
225  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
226
227  // X86 is weird, it always uses i8 for shift amounts and setcc results.
228  setBooleanContents(ZeroOrOneBooleanContent);
229  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
230  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
231
232  // For 64-bit since we have so many registers use the ILP scheduler, for
233  // 32-bit code use the register pressure specific scheduling.
234  // For Atom, always use ILP scheduling.
235  if (Subtarget->isAtom())
236    setSchedulingPreference(Sched::ILP);
237  else if (Subtarget->is64Bit())
238    setSchedulingPreference(Sched::ILP);
239  else
240    setSchedulingPreference(Sched::RegPressure);
241  const X86RegisterInfo *RegInfo =
242    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
243  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
244
245  // Bypass expensive divides on Atom when compiling with O2
246  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
247    addBypassSlowDiv(32, 8);
248    if (Subtarget->is64Bit())
249      addBypassSlowDiv(64, 16);
250  }
251
252  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
253    // Setup Windows compiler runtime calls.
254    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
255    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
256    setLibcallName(RTLIB::SREM_I64, "_allrem");
257    setLibcallName(RTLIB::UREM_I64, "_aullrem");
258    setLibcallName(RTLIB::MUL_I64, "_allmul");
259    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
260    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
261    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
262    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
263    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
264
265    // The _ftol2 runtime function has an unusual calling conv, which
266    // is modeled by a special pseudo-instruction.
267    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
268    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
269    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
270    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
271  }
272
273  if (Subtarget->isTargetDarwin()) {
274    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
275    setUseUnderscoreSetJmp(false);
276    setUseUnderscoreLongJmp(false);
277  } else if (Subtarget->isTargetMingw()) {
278    // MS runtime is weird: it exports _setjmp, but longjmp!
279    setUseUnderscoreSetJmp(true);
280    setUseUnderscoreLongJmp(false);
281  } else {
282    setUseUnderscoreSetJmp(true);
283    setUseUnderscoreLongJmp(true);
284  }
285
286  // Set up the register classes.
287  addRegisterClass(MVT::i8, &X86::GR8RegClass);
288  addRegisterClass(MVT::i16, &X86::GR16RegClass);
289  addRegisterClass(MVT::i32, &X86::GR32RegClass);
290  if (Subtarget->is64Bit())
291    addRegisterClass(MVT::i64, &X86::GR64RegClass);
292
293  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
294
295  // We don't accept any truncstore of integer registers.
296  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
297  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
298  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
299  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
300  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
301  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
302
303  // SETOEQ and SETUNE require checking two conditions.
304  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
305  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
306  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
307  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
308  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
309  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
310
311  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
312  // operation.
313  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
314  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
315  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
316
317  if (Subtarget->is64Bit()) {
318    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
319    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
320  } else if (!TM.Options.UseSoftFloat) {
321    // We have an algorithm for SSE2->double, and we turn this into a
322    // 64-bit FILD followed by conditional FADD for other targets.
323    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
324    // We have an algorithm for SSE2, and we turn this into a 64-bit
325    // FILD for other targets.
326    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
327  }
328
329  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
330  // this operation.
331  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
332  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
333
334  if (!TM.Options.UseSoftFloat) {
335    // SSE has no i16 to fp conversion, only i32
336    if (X86ScalarSSEf32) {
337      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
338      // f32 and f64 cases are Legal, f80 case is not
339      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
340    } else {
341      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
342      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
343    }
344  } else {
345    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
346    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
347  }
348
349  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
350  // are Legal, f80 is custom lowered.
351  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
352  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
353
354  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
355  // this operation.
356  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
357  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
358
359  if (X86ScalarSSEf32) {
360    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
361    // f32 and f64 cases are Legal, f80 case is not
362    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
363  } else {
364    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
365    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
366  }
367
368  // Handle FP_TO_UINT by promoting the destination to a larger signed
369  // conversion.
370  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
371  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
372  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
373
374  if (Subtarget->is64Bit()) {
375    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
376    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
377  } else if (!TM.Options.UseSoftFloat) {
378    // Since AVX is a superset of SSE3, only check for SSE here.
379    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
380      // Expand FP_TO_UINT into a select.
381      // FIXME: We would like to use a Custom expander here eventually to do
382      // the optimal thing for SSE vs. the default expansion in the legalizer.
383      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
384    else
385      // With SSE3 we can use fisttpll to convert to a signed i64; without
386      // SSE, we're stuck with a fistpll.
387      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
388  }
389
390  if (isTargetFTOL()) {
391    // Use the _ftol2 runtime function, which has a pseudo-instruction
392    // to handle its weird calling convention.
393    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
394  }
395
396  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
397  if (!X86ScalarSSEf64) {
398    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
399    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
400    if (Subtarget->is64Bit()) {
401      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
402      // Without SSE, i64->f64 goes through memory.
403      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
404    }
405  }
406
407  // Scalar integer divide and remainder are lowered to use operations that
408  // produce two results, to match the available instructions. This exposes
409  // the two-result form to trivial CSE, which is able to combine x/y and x%y
410  // into a single instruction.
411  //
412  // Scalar integer multiply-high is also lowered to use two-result
413  // operations, to match the available instructions. However, plain multiply
414  // (low) operations are left as Legal, as there are single-result
415  // instructions for this in x86. Using the two-result multiply instructions
416  // when both high and low results are needed must be arranged by dagcombine.
417  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
418    MVT VT = IntVTs[i];
419    setOperationAction(ISD::MULHS, VT, Expand);
420    setOperationAction(ISD::MULHU, VT, Expand);
421    setOperationAction(ISD::SDIV, VT, Expand);
422    setOperationAction(ISD::UDIV, VT, Expand);
423    setOperationAction(ISD::SREM, VT, Expand);
424    setOperationAction(ISD::UREM, VT, Expand);
425
426    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
427    setOperationAction(ISD::ADDC, VT, Custom);
428    setOperationAction(ISD::ADDE, VT, Custom);
429    setOperationAction(ISD::SUBC, VT, Custom);
430    setOperationAction(ISD::SUBE, VT, Custom);
431  }
432
433  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
434  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
435  setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
436  setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
437  setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
438  setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
439  setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
440  setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
441  setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
442  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
443  if (Subtarget->is64Bit())
444    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
445  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
446  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
447  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
448  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
449  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
450  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
451  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
452  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
453
454  // Promote the i8 variants and force them on up to i32 which has a shorter
455  // encoding.
456  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
457  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
458  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
459  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
460  if (Subtarget->hasBMI()) {
461    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
462    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
463    if (Subtarget->is64Bit())
464      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
465  } else {
466    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
467    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
468    if (Subtarget->is64Bit())
469      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
470  }
471
472  if (Subtarget->hasLZCNT()) {
473    // When promoting the i8 variants, force them to i32 for a shorter
474    // encoding.
475    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
476    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
477    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
478    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
479    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
480    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
481    if (Subtarget->is64Bit())
482      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
483  } else {
484    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
485    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
486    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
487    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
488    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
489    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
490    if (Subtarget->is64Bit()) {
491      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
492      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
493    }
494  }
495
496  if (Subtarget->hasPOPCNT()) {
497    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
498  } else {
499    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
500    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
501    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
502    if (Subtarget->is64Bit())
503      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
504  }
505
506  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
507  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
508
509  // These should be promoted to a larger select which is supported.
510  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
511  // X86 wants to expand cmov itself.
512  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
513  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
514  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
515  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
516  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
517  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
518  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
519  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
520  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
521  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
522  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
523  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
524  if (Subtarget->is64Bit()) {
525    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
526    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
527  }
528  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
529  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
530  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
531  // support continuation, user-level threading, and etc.. As a result, no
532  // other SjLj exception interfaces are implemented and please don't build
533  // your own exception handling based on them.
534  // LLVM/Clang supports zero-cost DWARF exception handling.
535  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
536  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
537
538  // Darwin ABI issue.
539  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
540  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
541  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
542  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
543  if (Subtarget->is64Bit())
544    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
545  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
546  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
547  if (Subtarget->is64Bit()) {
548    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
549    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
550    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
551    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
552    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
553  }
554  // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
555  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
556  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
557  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
558  if (Subtarget->is64Bit()) {
559    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
560    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
561    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
562  }
563
564  if (Subtarget->hasSSE1())
565    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
566
567  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
568
569  // Expand certain atomics
570  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
571    MVT VT = IntVTs[i];
572    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
573    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
574    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
575  }
576
577  if (!Subtarget->is64Bit()) {
578    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
579    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
580    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
581    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
582    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
583    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
584    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
585    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
586    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
587    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
588    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
589    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
590  }
591
592  if (Subtarget->hasCmpxchg16b()) {
593    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
594  }
595
596  // FIXME - use subtarget debug flags
597  if (!Subtarget->isTargetDarwin() &&
598      !Subtarget->isTargetELF() &&
599      !Subtarget->isTargetCygMing()) {
600    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
601  }
602
603  if (Subtarget->is64Bit()) {
604    setExceptionPointerRegister(X86::RAX);
605    setExceptionSelectorRegister(X86::RDX);
606  } else {
607    setExceptionPointerRegister(X86::EAX);
608    setExceptionSelectorRegister(X86::EDX);
609  }
610  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
611  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
612
613  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
614  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
615
616  setOperationAction(ISD::TRAP, MVT::Other, Legal);
617  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
618
619  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
620  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
621  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
622  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
623    // TargetInfo::X86_64ABIBuiltinVaList
624    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
625    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
626  } else {
627    // TargetInfo::CharPtrBuiltinVaList
628    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
629    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
630  }
631
632  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
633  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
634
635  if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho())
636    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
637                       MVT::i64 : MVT::i32, Custom);
638  else if (TM.Options.EnableSegmentedStacks)
639    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
640                       MVT::i64 : MVT::i32, Custom);
641  else
642    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
643                       MVT::i64 : MVT::i32, Expand);
644
645  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
646    // f32 and f64 use SSE.
647    // Set up the FP register classes.
648    addRegisterClass(MVT::f32, &X86::FR32RegClass);
649    addRegisterClass(MVT::f64, &X86::FR64RegClass);
650
651    // Use ANDPD to simulate FABS.
652    setOperationAction(ISD::FABS , MVT::f64, Custom);
653    setOperationAction(ISD::FABS , MVT::f32, Custom);
654
655    // Use XORP to simulate FNEG.
656    setOperationAction(ISD::FNEG , MVT::f64, Custom);
657    setOperationAction(ISD::FNEG , MVT::f32, Custom);
658
659    // Use ANDPD and ORPD to simulate FCOPYSIGN.
660    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
661    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
662
663    // Lower this to FGETSIGNx86 plus an AND.
664    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
665    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
666
667    // We don't support sin/cos/fmod
668    setOperationAction(ISD::FSIN   , MVT::f64, Expand);
669    setOperationAction(ISD::FCOS   , MVT::f64, Expand);
670    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
671    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
672    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
673    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
674
675    // Expand FP immediates into loads from the stack, except for the special
676    // cases we handle.
677    addLegalFPImmediate(APFloat(+0.0)); // xorpd
678    addLegalFPImmediate(APFloat(+0.0f)); // xorps
679  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
680    // Use SSE for f32, x87 for f64.
681    // Set up the FP register classes.
682    addRegisterClass(MVT::f32, &X86::FR32RegClass);
683    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
684
685    // Use ANDPS to simulate FABS.
686    setOperationAction(ISD::FABS , MVT::f32, Custom);
687
688    // Use XORP to simulate FNEG.
689    setOperationAction(ISD::FNEG , MVT::f32, Custom);
690
691    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
692
693    // Use ANDPS and ORPS to simulate FCOPYSIGN.
694    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
695    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
696
697    // We don't support sin/cos/fmod
698    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
699    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
700    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
701
702    // Special cases we handle for FP constants.
703    addLegalFPImmediate(APFloat(+0.0f)); // xorps
704    addLegalFPImmediate(APFloat(+0.0)); // FLD0
705    addLegalFPImmediate(APFloat(+1.0)); // FLD1
706    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
707    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
708
709    if (!TM.Options.UnsafeFPMath) {
710      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
711      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
712      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
713    }
714  } else if (!TM.Options.UseSoftFloat) {
715    // f32 and f64 in x87.
716    // Set up the FP register classes.
717    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
718    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
719
720    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
721    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
722    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
723    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
724
725    if (!TM.Options.UnsafeFPMath) {
726      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
727      setOperationAction(ISD::FSIN   , MVT::f32, Expand);
728      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
729      setOperationAction(ISD::FCOS   , MVT::f32, Expand);
730      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
731      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
732    }
733    addLegalFPImmediate(APFloat(+0.0)); // FLD0
734    addLegalFPImmediate(APFloat(+1.0)); // FLD1
735    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
736    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
737    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
738    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
739    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
740    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
741  }
742
743  // We don't support FMA.
744  setOperationAction(ISD::FMA, MVT::f64, Expand);
745  setOperationAction(ISD::FMA, MVT::f32, Expand);
746
747  // Long double always uses X87.
748  if (!TM.Options.UseSoftFloat) {
749    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
750    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
751    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
752    {
753      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
754      addLegalFPImmediate(TmpFlt);  // FLD0
755      TmpFlt.changeSign();
756      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
757
758      bool ignored;
759      APFloat TmpFlt2(+1.0);
760      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
761                      &ignored);
762      addLegalFPImmediate(TmpFlt2);  // FLD1
763      TmpFlt2.changeSign();
764      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
765    }
766
767    if (!TM.Options.UnsafeFPMath) {
768      setOperationAction(ISD::FSIN   , MVT::f80, Expand);
769      setOperationAction(ISD::FCOS   , MVT::f80, Expand);
770      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
771    }
772
773    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
774    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
775    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
776    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
777    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
778    setOperationAction(ISD::FMA, MVT::f80, Expand);
779  }
780
781  // Always use a library call for pow.
782  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
783  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
784  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
785
786  setOperationAction(ISD::FLOG, MVT::f80, Expand);
787  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
788  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
789  setOperationAction(ISD::FEXP, MVT::f80, Expand);
790  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
791
792  // First set operation action for all vector types to either promote
793  // (for widening) or expand (for scalarization). Then we will selectively
794  // turn on ones that can be effectively codegen'd.
795  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
796           i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
797    MVT VT = (MVT::SimpleValueType)i;
798    setOperationAction(ISD::ADD , VT, Expand);
799    setOperationAction(ISD::SUB , VT, Expand);
800    setOperationAction(ISD::FADD, VT, Expand);
801    setOperationAction(ISD::FNEG, VT, Expand);
802    setOperationAction(ISD::FSUB, VT, Expand);
803    setOperationAction(ISD::MUL , VT, Expand);
804    setOperationAction(ISD::FMUL, VT, Expand);
805    setOperationAction(ISD::SDIV, VT, Expand);
806    setOperationAction(ISD::UDIV, VT, Expand);
807    setOperationAction(ISD::FDIV, VT, Expand);
808    setOperationAction(ISD::SREM, VT, Expand);
809    setOperationAction(ISD::UREM, VT, Expand);
810    setOperationAction(ISD::LOAD, VT, Expand);
811    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
812    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
813    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
814    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
815    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
816    setOperationAction(ISD::FABS, VT, Expand);
817    setOperationAction(ISD::FSIN, VT, Expand);
818    setOperationAction(ISD::FSINCOS, VT, Expand);
819    setOperationAction(ISD::FCOS, VT, Expand);
820    setOperationAction(ISD::FSINCOS, VT, Expand);
821    setOperationAction(ISD::FREM, VT, Expand);
822    setOperationAction(ISD::FMA,  VT, Expand);
823    setOperationAction(ISD::FPOWI, VT, Expand);
824    setOperationAction(ISD::FSQRT, VT, Expand);
825    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
826    setOperationAction(ISD::FFLOOR, VT, Expand);
827    setOperationAction(ISD::FCEIL, VT, Expand);
828    setOperationAction(ISD::FTRUNC, VT, Expand);
829    setOperationAction(ISD::FRINT, VT, Expand);
830    setOperationAction(ISD::FNEARBYINT, VT, Expand);
831    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
832    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
833    setOperationAction(ISD::SDIVREM, VT, Expand);
834    setOperationAction(ISD::UDIVREM, VT, Expand);
835    setOperationAction(ISD::FPOW, VT, Expand);
836    setOperationAction(ISD::CTPOP, VT, Expand);
837    setOperationAction(ISD::CTTZ, VT, Expand);
838    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
839    setOperationAction(ISD::CTLZ, VT, Expand);
840    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
841    setOperationAction(ISD::SHL, VT, Expand);
842    setOperationAction(ISD::SRA, VT, Expand);
843    setOperationAction(ISD::SRL, VT, Expand);
844    setOperationAction(ISD::ROTL, VT, Expand);
845    setOperationAction(ISD::ROTR, VT, Expand);
846    setOperationAction(ISD::BSWAP, VT, Expand);
847    setOperationAction(ISD::SETCC, VT, Expand);
848    setOperationAction(ISD::FLOG, VT, Expand);
849    setOperationAction(ISD::FLOG2, VT, Expand);
850    setOperationAction(ISD::FLOG10, VT, Expand);
851    setOperationAction(ISD::FEXP, VT, Expand);
852    setOperationAction(ISD::FEXP2, VT, Expand);
853    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
854    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
855    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
856    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
857    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
858    setOperationAction(ISD::TRUNCATE, VT, Expand);
859    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
860    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
861    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
862    setOperationAction(ISD::VSELECT, VT, Expand);
863    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
864             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
865      setTruncStoreAction(VT,
866                          (MVT::SimpleValueType)InnerVT, Expand);
867    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
868    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
869    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
870  }
871
872  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
873  // with -msoft-float, disable use of MMX as well.
874  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
875    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
876    // No operations on x86mmx supported, everything uses intrinsics.
877  }
878
879  // MMX-sized vectors (other than x86mmx) are expected to be expanded
880  // into smaller operations.
881  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
882  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
883  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
884  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
885  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
886  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
887  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
888  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
889  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
890  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
891  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
892  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
893  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
894  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
895  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
896  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
897  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
898  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
899  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
900  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
901  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
902  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
903  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
904  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
905  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
906  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
907  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
908  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
909  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
910
911  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
912    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
913
914    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
915    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
916    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
917    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
918    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
919    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
920    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
921    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
922    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
923    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
924    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
925    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
926  }
927
928  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
929    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
930
931    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
932    // registers cannot be used even for integer operations.
933    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
934    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
935    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
936    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
937
938    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
939    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
940    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
941    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
942    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
943    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
944    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
945    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
946    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
947    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
948    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
949    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
950    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
951    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
952    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
953    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
954    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
955    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
956
957    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
958    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
959    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
960    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
961
962    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
963    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
964    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
965    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
966    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
967
968    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
969    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
970      MVT VT = (MVT::SimpleValueType)i;
971      // Do not attempt to custom lower non-power-of-2 vectors
972      if (!isPowerOf2_32(VT.getVectorNumElements()))
973        continue;
974      // Do not attempt to custom lower non-128-bit vectors
975      if (!VT.is128BitVector())
976        continue;
977      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
978      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
979      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
980    }
981
982    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
983    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
984    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
985    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
986    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
987    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
988
989    if (Subtarget->is64Bit()) {
990      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
991      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
992    }
993
994    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
995    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
996      MVT VT = (MVT::SimpleValueType)i;
997
998      // Do not attempt to promote non-128-bit vectors
999      if (!VT.is128BitVector())
1000        continue;
1001
1002      setOperationAction(ISD::AND,    VT, Promote);
1003      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1004      setOperationAction(ISD::OR,     VT, Promote);
1005      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1006      setOperationAction(ISD::XOR,    VT, Promote);
1007      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1008      setOperationAction(ISD::LOAD,   VT, Promote);
1009      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1010      setOperationAction(ISD::SELECT, VT, Promote);
1011      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1012    }
1013
1014    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
1015
1016    // Custom lower v2i64 and v2f64 selects.
1017    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1018    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1019    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1020    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1021
1022    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1023    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1024
1025    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1026    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1027    // As there is no 64-bit GPR available, we need build a special custom
1028    // sequence to convert from v2i32 to v2f32.
1029    if (!Subtarget->is64Bit())
1030      setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1031
1032    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1033    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1034
1035    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
1036  }
1037
1038  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1039    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1040    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1041    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1042    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1043    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1044    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1045    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1046    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1047    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1048    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1049
1050    setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1051    setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1052    setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1053    setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1054    setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1055    setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1056    setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1057    setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1058    setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1059    setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1060
1061    // FIXME: Do we need to handle scalar-to-vector here?
1062    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1063
1064    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
1065    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
1066    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1067    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
1068    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
1069
1070    // i8 and i16 vectors are custom , because the source register and source
1071    // source memory operand types are not the same width.  f32 vectors are
1072    // custom since the immediate controlling the insert encodes additional
1073    // information.
1074    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1075    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1076    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1077    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1078
1079    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1080    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1081    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1082    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1083
1084    // FIXME: these should be Legal but thats only for the case where
1085    // the index is constant.  For now custom expand to deal with that.
1086    if (Subtarget->is64Bit()) {
1087      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1088      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1089    }
1090  }
1091
1092  if (Subtarget->hasSSE2()) {
1093    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1094    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1095
1096    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1097    setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1098
1099    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1100    setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1101
1102    // In the customized shift lowering, the legal cases in AVX2 will be
1103    // recognized.
1104    setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1105    setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1106
1107    setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1108    setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1109
1110    setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1111
1112    setOperationAction(ISD::SDIV,              MVT::v8i16, Custom);
1113    setOperationAction(ISD::SDIV,              MVT::v4i32, Custom);
1114  }
1115
1116  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1117    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1118    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1119    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1120    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1121    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1122    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1123
1124    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1125    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1126    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1127
1128    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1129    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1130    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1131    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1132    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1133    setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1134    setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1135    setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1136    setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1137    setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1138    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1139    setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1140
1141    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1142    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1143    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1144    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1145    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1146    setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1147    setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1148    setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1149    setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1150    setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1151    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1152    setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1153
1154    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
1155
1156    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1157    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1158    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1159    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1160
1161    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1162    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1163
1164    setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
1165
1166    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1167    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1168
1169    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1170    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1171
1172    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1173    setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1174
1175    setOperationAction(ISD::SDIV,              MVT::v16i16, Custom);
1176
1177    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1178    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1179    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1180    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1181
1182    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1183    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1184    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1185
1186    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
1187    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
1188    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
1189    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
1190
1191    setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1192    setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1193    setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1194    setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1195    setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1196    setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1197    setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1198    setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1199    setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1200    setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1201    setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1202    setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1203
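    // Both Intel FMA3 (hasFMA) and AMD FMA4 (hasFMA4) provide fused multiply-add
    // for these types, so either feature is enough to mark ISD::FMA legal here.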
1204    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1205      setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1206      setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1207      setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1208      setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1209      setOperationAction(ISD::FMA,             MVT::f32, Legal);
1210      setOperationAction(ISD::FMA,             MVT::f64, Legal);
1211    }
1212
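    // hasInt256 corresponds to AVX2, which adds genuine 256-bit integer
    // instructions; without it, the 256-bit integer ops below are custom-lowered
    // by splitting them into two 128-bit halves.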
1213    if (Subtarget->hasInt256()) {
1214      setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1215      setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1216      setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1217      setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1218
1219      setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1220      setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1221      setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1222      setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1223
1224      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1225      setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1226      setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1227      // Don't lower v32i8 because there is no 128-bit byte mul
1228
1229      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1230
1231      setOperationAction(ISD::SDIV,            MVT::v8i32, Custom);
1232    } else {
1233      setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1234      setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1235      setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1236      setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1237
1238      setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1239      setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1240      setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1241      setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1242
1243      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1244      setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1245      setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1246      // Don't lower v32i8 because there is no 128-bit byte mul
1247    }
1248
1249    // In the custom shift lowering, the cases that are legal with AVX2 will be
1250    // recognized.
1251    setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1252    setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1253
1254    setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1255    setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1256
1257    setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1258
1259    // Custom lower several nodes for 256-bit types.
1260    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
1261             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
1262      MVT VT = (MVT::SimpleValueType)i;
1263
1264      // Extract subvector is special because the value type
1265      // (result) is 128-bit but the source is 256-bit wide.
1266      if (VT.is128BitVector())
1267        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1268
1269      // Do not attempt to custom lower other non-256-bit vectors
1270      if (!VT.is256BitVector())
1271        continue;
1272
1273      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1274      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1275      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1276      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1277      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1278      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1279      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1280    }
1281
1282    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
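    // These are all bit-pattern-preserving operations, so promoting them to the
    // single v4i64 type lets one set of 256-bit patterns handle every case.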
1283    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1284      MVT VT = (MVT::SimpleValueType)i;
1285
1286      // Do not attempt to promote non-256-bit vectors
1287      if (!VT.is256BitVector())
1288        continue;
1289
1290      setOperationAction(ISD::AND,    VT, Promote);
1291      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1292      setOperationAction(ISD::OR,     VT, Promote);
1293      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1294      setOperationAction(ISD::XOR,    VT, Promote);
1295      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1296      setOperationAction(ISD::LOAD,   VT, Promote);
1297      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1298      setOperationAction(ISD::SELECT, VT, Promote);
1299      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1300    }
1301  }
1302
1303  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1304    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1305    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1306    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1307    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1308
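    // AVX-512 compare results are masks; v8i1 and v16i1 live in the dedicated
    // k0-k7 mask registers (VK8/VK16) rather than in vector registers.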
1309    addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1310    addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1311
1312    setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
1313    setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1314    setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1315    setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1316    setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1317    setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1318
1319    setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1320    setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1321    setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1322    setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1323    setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1324    setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1325
1326    setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1327    setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1328    setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1329    setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1330    setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1331    setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1332    setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1333    setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1334    setOperationAction(ISD::SDIV,               MVT::v16i32, Custom);
1335
1336    setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1337    setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1338    setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1339    setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1340    if (Subtarget->is64Bit()) {
1341      setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1342      setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1343      setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1344      setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1345    }
1346    setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1347    setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1348    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1349    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1350    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1351    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1352    setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1353    setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1354
1355    setOperationAction(ISD::TRUNCATE,           MVT::i1, Legal);
1356    setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1357    setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1358    setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1359    setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1360    setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1361    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1362    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1363    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1364    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1365    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1366    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1367
1368    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1369    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1370    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1371    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1372    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1373
1374    setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1375    setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1376
1377    setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1378
1379    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1380    setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1381    setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1382    setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1383    setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1384
1385    setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1386    setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1387
1388    setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1389    setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1390
1391    setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1392
1393    setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1394    setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1395
1396    setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1397    setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1398
1399    setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1400    setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1401
1402    setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1403    setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1404    setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1405    setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1406    setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1407    setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1408
1409    // Custom lower several nodes.
1410    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
1411             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
1412      MVT VT = (MVT::SimpleValueType)i;
1413
1414      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1415      // Extract subvector is special because the value type
1416      // (result) is 256/128-bit but the source is 512-bit wide.
1417      if (VT.is128BitVector() || VT.is256BitVector())
1418        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1419
1420      if (VT.getVectorElementType() == MVT::i1)
1421        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1422
1423      // Do not attempt to custom lower other non-512-bit vectors
1424      if (!VT.is512BitVector())
1425        continue;
1426
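      // The base AVX-512 instruction set only covers 32- and 64-bit elements;
      // vectors with smaller elements are left to the default handling here.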
1427      if (EltSize >= 32) {
1428        setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1429        setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1430        setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1431        setOperationAction(ISD::VSELECT,             VT, Legal);
1432        setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1433        setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1434        setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1435      }
1436    }
1437    for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1438      MVT VT = (MVT::SimpleValueType)i;
1439
1440      // Do not attempt to promote non-512-bit vectors
1441      if (!VT.is512BitVector())
1442        continue;
1443
1444      setOperationAction(ISD::SELECT, VT, Promote);
1445      AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1446    }
1447  } // has AVX-512
1448
1449  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1450  // of this type with custom code.
1451  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
1452           VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
1453    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
1454                       Custom);
1455  }
1456
1457  // We want to custom lower some of our intrinsics.
1458  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1459  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1460  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1461
1462  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1463  // handle type legalization for these operations here.
1464  //
1465  // FIXME: We really should do custom legalization for addition and
1466  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1467  // than generic legalization for 64-bit multiplication-with-overflow, though.
1468  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1469    // Add/Sub/Mul with overflow operations are custom lowered.
1470    MVT VT = IntVTs[i];
1471    setOperationAction(ISD::SADDO, VT, Custom);
1472    setOperationAction(ISD::UADDO, VT, Custom);
1473    setOperationAction(ISD::SSUBO, VT, Custom);
1474    setOperationAction(ISD::USUBO, VT, Custom);
1475    setOperationAction(ISD::SMULO, VT, Custom);
1476    setOperationAction(ISD::UMULO, VT, Custom);
1477  }
1478
1479  // There are no 8-bit 3-address imul/mul instructions
1480  setOperationAction(ISD::SMULO, MVT::i8, Expand);
1481  setOperationAction(ISD::UMULO, MVT::i8, Expand);
1482
1483  if (!Subtarget->is64Bit()) {
1484    // These libcalls are not available in 32-bit.
1485    setLibcallName(RTLIB::SHL_I128, 0);
1486    setLibcallName(RTLIB::SRL_I128, 0);
1487    setLibcallName(RTLIB::SRA_I128, 0);
1488  }
1489
1490  // Combine sin / cos into one node or libcall if possible.
1491  if (Subtarget->hasSinCos()) {
1492    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1493    setLibcallName(RTLIB::SINCOS_F64, "sincos");
1494    if (Subtarget->isTargetDarwin()) {
1495      // For MacOSX, we don't want the normal expansion of a libcall to
1496      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
1497      // traffic.
1498      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1499      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1500    }
1501  }
1502
1503  // We have target-specific dag combine patterns for the following nodes:
1504  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1505  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1506  setTargetDAGCombine(ISD::VSELECT);
1507  setTargetDAGCombine(ISD::SELECT);
1508  setTargetDAGCombine(ISD::SHL);
1509  setTargetDAGCombine(ISD::SRA);
1510  setTargetDAGCombine(ISD::SRL);
1511  setTargetDAGCombine(ISD::OR);
1512  setTargetDAGCombine(ISD::AND);
1513  setTargetDAGCombine(ISD::ADD);
1514  setTargetDAGCombine(ISD::FADD);
1515  setTargetDAGCombine(ISD::FSUB);
1516  setTargetDAGCombine(ISD::FMA);
1517  setTargetDAGCombine(ISD::SUB);
1518  setTargetDAGCombine(ISD::LOAD);
1519  setTargetDAGCombine(ISD::STORE);
1520  setTargetDAGCombine(ISD::ZERO_EXTEND);
1521  setTargetDAGCombine(ISD::ANY_EXTEND);
1522  setTargetDAGCombine(ISD::SIGN_EXTEND);
1523  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1524  setTargetDAGCombine(ISD::TRUNCATE);
1525  setTargetDAGCombine(ISD::SINT_TO_FP);
1526  setTargetDAGCombine(ISD::SETCC);
1527  if (Subtarget->is64Bit())
1528    setTargetDAGCombine(ISD::MUL);
1529  setTargetDAGCombine(ISD::XOR);
1530
1531  computeRegisterProperties();
1532
1533  // On Darwin, -Os means optimize for size without hurting performance,
1534  // so do not reduce the limit.
1535  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1536  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1537  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1538  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1539  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1540  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1541  setPrefLoopAlignment(4); // 2^4 bytes.
1542
1543  // Predictable cmovs don't hurt on Atom because it's in-order.
1544  PredictableSelectIsExpensive = !Subtarget->isAtom();
1545
1546  setPrefFunctionAlignment(4); // 2^4 bytes.
1547}
1548
1549EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1550  if (!VT.isVector())
1551    return MVT::i8;
1552
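  // With AVX-512, vector compares produce i1 mask vectors (v8i1/v16i1) rather
  // than vectors of all-ones/all-zeros elements.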
1553  const TargetMachine &TM = getTargetMachine();
1554  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512())
1555    switch(VT.getVectorNumElements()) {
1556    case  8: return MVT::v8i1;
1557    case 16: return MVT::v16i1;
1558    }
1559
1560  return VT.changeVectorElementTypeToInteger();
1561}
1562
1563/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1564/// the desired ByVal argument alignment.
1565static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1566  if (MaxAlign == 16)
1567    return;
1568  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1569    if (VTy->getBitWidth() == 128)
1570      MaxAlign = 16;
1571  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1572    unsigned EltAlign = 0;
1573    getMaxByValAlign(ATy->getElementType(), EltAlign);
1574    if (EltAlign > MaxAlign)
1575      MaxAlign = EltAlign;
1576  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1577    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1578      unsigned EltAlign = 0;
1579      getMaxByValAlign(STy->getElementType(i), EltAlign);
1580      if (EltAlign > MaxAlign)
1581        MaxAlign = EltAlign;
1582      if (MaxAlign == 16)
1583        break;
1584    }
1585  }
1586}
1587
1588/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1589/// function arguments in the caller parameter area. For X86, aggregates
1590/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1591/// are at 4-byte boundaries.
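/// For example, with SSE available a struct containing a <4 x float> field is
/// placed on a 16-byte boundary, while a struct of scalar ints stays at 4 bytes.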
1592unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1593  if (Subtarget->is64Bit()) {
1594    // Max of 8 and alignment of type.
1595    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1596    if (TyAlign > 8)
1597      return TyAlign;
1598    return 8;
1599  }
1600
1601  unsigned Align = 4;
1602  if (Subtarget->hasSSE1())
1603    getMaxByValAlign(Ty, Align);
1604  return Align;
1605}
1606
1607/// getOptimalMemOpType - Returns the target specific optimal type for load
1608/// and store operations as a result of memset, memcpy, and memmove
1609/// lowering. If DstAlign is zero, the destination alignment can satisfy any
1610/// constraint. Similarly, if SrcAlign is zero there is no need to check it
1611/// against the alignment requirement,
1612/// probably because the source does not need to be loaded. If 'IsMemset' is
1613/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1614/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1615/// source is constant so it does not need to be loaded.
1616/// It returns EVT::Other if the type should be determined using generic
1617/// target-independent logic.
1618EVT
1619X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1620                                       unsigned DstAlign, unsigned SrcAlign,
1621                                       bool IsMemset, bool ZeroMemset,
1622                                       bool MemcpyStrSrc,
1623                                       MachineFunction &MF) const {
1624  const Function *F = MF.getFunction();
1625  if ((!IsMemset || ZeroMemset) &&
1626      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1627                                       Attribute::NoImplicitFloat)) {
1628    if (Size >= 16 &&
1629        (Subtarget->isUnalignedMemAccessFast() ||
1630         ((DstAlign == 0 || DstAlign >= 16) &&
1631          (SrcAlign == 0 || SrcAlign >= 16)))) {
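      // Prefer the widest vector type the subtarget handles well: a 256-bit
      // type when copying at least 32 bytes with AVX, otherwise a 128-bit SSE type.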
1632      if (Size >= 32) {
1633        if (Subtarget->hasInt256())
1634          return MVT::v8i32;
1635        if (Subtarget->hasFp256())
1636          return MVT::v8f32;
1637      }
1638      if (Subtarget->hasSSE2())
1639        return MVT::v4i32;
1640      if (Subtarget->hasSSE1())
1641        return MVT::v4f32;
1642    } else if (!MemcpyStrSrc && Size >= 8 &&
1643               !Subtarget->is64Bit() &&
1644               Subtarget->hasSSE2()) {
1645      // Do not use f64 to lower memcpy if source is string constant. It's
1646      // better to use i32 to avoid the loads.
1647      return MVT::f64;
1648    }
1649  }
1650  if (Subtarget->is64Bit() && Size >= 8)
1651    return MVT::i64;
1652  return MVT::i32;
1653}
1654
1655bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1656  if (VT == MVT::f32)
1657    return X86ScalarSSEf32;
1658  else if (VT == MVT::f64)
1659    return X86ScalarSSEf64;
1660  return true;
1661}
1662
1663bool
1664X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
1665  if (Fast)
1666    *Fast = Subtarget->isUnalignedMemAccessFast();
1667  return true;
1668}
1669
1670/// getJumpTableEncoding - Return the entry encoding for a jump table in the
1671/// current function.  The returned value is a member of the
1672/// MachineJumpTableInfo::JTEntryKind enum.
1673unsigned X86TargetLowering::getJumpTableEncoding() const {
1674  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1675  // symbol.
1676  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1677      Subtarget->isPICStyleGOT())
1678    return MachineJumpTableInfo::EK_Custom32;
1679
1680  // Otherwise, use the normal jump table encoding heuristics.
1681  return TargetLowering::getJumpTableEncoding();
1682}
1683
1684const MCExpr *
1685X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1686                                             const MachineBasicBlock *MBB,
1687                                             unsigned uid,MCContext &Ctx) const{
1688  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1689         Subtarget->isPICStyleGOT());
1690  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1691  // references.
1692  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1693                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1694}
1695
1696/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
1697/// jumptable.
1698SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1699                                                    SelectionDAG &DAG) const {
1700  if (!Subtarget->is64Bit())
1701    // This doesn't have SDLoc associated with it, but is not really the
1702    // same as a Register.
1703    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1704  return Table;
1705}
1706
1707/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1708/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1709/// MCExpr.
1710const MCExpr *X86TargetLowering::
1711getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1712                             MCContext &Ctx) const {
1713  // X86-64 uses RIP relative addressing based on the jump table label.
1714  if (Subtarget->isPICStyleRIPRel())
1715    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1716
1717  // Otherwise, the reference is relative to the PIC base.
1718  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1719}
1720
1721// FIXME: Why is this routine here? Move to RegInfo!
1722std::pair<const TargetRegisterClass*, uint8_t>
1723X86TargetLowering::findRepresentativeClass(MVT VT) const{
1724  const TargetRegisterClass *RRC = 0;
1725  uint8_t Cost = 1;
1726  switch (VT.SimpleTy) {
1727  default:
1728    return TargetLowering::findRepresentativeClass(VT);
1729  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1730    RRC = Subtarget->is64Bit() ?
1731      (const TargetRegisterClass*)&X86::GR64RegClass :
1732      (const TargetRegisterClass*)&X86::GR32RegClass;
1733    break;
1734  case MVT::x86mmx:
1735    RRC = &X86::VR64RegClass;
1736    break;
1737  case MVT::f32: case MVT::f64:
1738  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1739  case MVT::v4f32: case MVT::v2f64:
1740  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1741  case MVT::v4f64:
1742    RRC = &X86::VR128RegClass;
1743    break;
1744  }
1745  return std::make_pair(RRC, Cost);
1746}
1747
1748bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1749                                               unsigned &Offset) const {
1750  if (!Subtarget->isTargetLinux())
1751    return false;
1752
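  // In the X86 backend, address space 256 refers to the %gs segment and 257
  // to %fs.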
1753  if (Subtarget->is64Bit()) {
1754    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
1755    Offset = 0x28;
1756    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1757      AddressSpace = 256;
1758    else
1759      AddressSpace = 257;
1760  } else {
1761    // %gs:0x14 on i386
1762    Offset = 0x14;
1763    AddressSpace = 256;
1764  }
1765  return true;
1766}
1767
1768//===----------------------------------------------------------------------===//
1769//               Return Value Calling Convention Implementation
1770//===----------------------------------------------------------------------===//
1771
1772#include "X86GenCallingConv.inc"
1773
1774bool
1775X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1776                                  MachineFunction &MF, bool isVarArg,
1777                        const SmallVectorImpl<ISD::OutputArg> &Outs,
1778                        LLVMContext &Context) const {
1779  SmallVector<CCValAssign, 16> RVLocs;
1780  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1781                 RVLocs, Context);
1782  return CCInfo.CheckReturn(Outs, RetCC_X86);
1783}
1784
1785const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
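  // R11 is caller-saved and is not used to pass arguments in the common x86-64
  // calling conventions, so it is safe to hand out as a scratch register here.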
1786  static const uint16_t ScratchRegs[] = { X86::R11, 0 };
1787  return ScratchRegs;
1788}
1789
1790SDValue
1791X86TargetLowering::LowerReturn(SDValue Chain,
1792                               CallingConv::ID CallConv, bool isVarArg,
1793                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1794                               const SmallVectorImpl<SDValue> &OutVals,
1795                               SDLoc dl, SelectionDAG &DAG) const {
1796  MachineFunction &MF = DAG.getMachineFunction();
1797  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1798
1799  SmallVector<CCValAssign, 16> RVLocs;
1800  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1801                 RVLocs, *DAG.getContext());
1802  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1803
1804  SDValue Flag;
1805  SmallVector<SDValue, 6> RetOps;
1806  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1807  // Operand #1 = Bytes To Pop
1808  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1809                   MVT::i16));
1810
1811  // Copy the result values into the output registers.
1812  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1813    CCValAssign &VA = RVLocs[i];
1814    assert(VA.isRegLoc() && "Can only return in registers!");
1815    SDValue ValToCopy = OutVals[i];
1816    EVT ValVT = ValToCopy.getValueType();
1817
1818    // Promote values to the appropriate types
1819    if (VA.getLocInfo() == CCValAssign::SExt)
1820      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
1821    else if (VA.getLocInfo() == CCValAssign::ZExt)
1822      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
1823    else if (VA.getLocInfo() == CCValAssign::AExt)
1824      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
1825    else if (VA.getLocInfo() == CCValAssign::BCvt)
1826      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
1827
1828    // If this is x86-64, and we disabled SSE, we can't return FP values,
1829    // or SSE or MMX vectors.
1830    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
1831         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
1832          (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
1833      report_fatal_error("SSE register return with SSE disabled");
1834    }
1835    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
1836    // llvm-gcc has never done it right and no one has noticed, so this
1837    // should be OK for now.
1838    if (ValVT == MVT::f64 &&
1839        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
1840      report_fatal_error("SSE2 register return with SSE2 disabled");
1841
1842    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1843    // the RET instruction and handled by the FP Stackifier.
1844    if (VA.getLocReg() == X86::ST0 ||
1845        VA.getLocReg() == X86::ST1) {
1846      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1847      // change the value to the FP stack register class.
1848      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1849        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1850      RetOps.push_back(ValToCopy);
1851      // Don't emit a copytoreg.
1852      continue;
1853    }
1854
1855    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1856    // which is returned in RAX / RDX.
1857    if (Subtarget->is64Bit()) {
1858      if (ValVT == MVT::x86mmx) {
1859        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1860          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
1861          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
1862                                  ValToCopy);
1863          // If we don't have SSE2 available, convert to v4f32 so the generated
1864          // register is legal.
1865          if (!Subtarget->hasSSE2())
1866            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
1867        }
1868      }
1869    }
1870
1871    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1872    Flag = Chain.getValue(1);
1873    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1874  }
1875
1876  // The x86-64 ABIs require that for returning structs by value we copy
1877  // the sret argument into %rax/%eax (depending on ABI) for the return.
1878  // Win32 requires us to put the sret argument to %eax as well.
1879  // We saved the argument into a virtual register in the entry block,
1880  // so now we copy the value out and into %rax/%eax.
1881  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
1882      (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
1883    MachineFunction &MF = DAG.getMachineFunction();
1884    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1885    unsigned Reg = FuncInfo->getSRetReturnReg();
1886    assert(Reg &&
1887           "SRetReturnReg should have been set in LowerFormalArguments().");
1888    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1889
1890    unsigned RetValReg
1891        = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
1892          X86::RAX : X86::EAX;
1893    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
1894    Flag = Chain.getValue(1);
1895
1896    // RAX/EAX now acts like a return value.
1897    RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
1898  }
1899
1900  RetOps[0] = Chain;  // Update chain.
1901
1902  // Add the flag if we have it.
1903  if (Flag.getNode())
1904    RetOps.push_back(Flag);
1905
1906  return DAG.getNode(X86ISD::RET_FLAG, dl,
1907                     MVT::Other, &RetOps[0], RetOps.size());
1908}
1909
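/// isUsedByReturnOnly - Return true if the result of this node is used only by
/// a return (possibly through a CopyToReg or FP_EXTEND), so that the operation
/// producing it may be considered for tail call optimization.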
1910bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
1911  if (N->getNumValues() != 1)
1912    return false;
1913  if (!N->hasNUsesOfValue(1, 0))
1914    return false;
1915
1916  SDValue TCChain = Chain;
1917  SDNode *Copy = *N->use_begin();
1918  if (Copy->getOpcode() == ISD::CopyToReg) {
1919    // If the copy has a glue operand, we conservatively assume it isn't safe to
1920    // perform a tail call.
1921    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
1922      return false;
1923    TCChain = Copy->getOperand(0);
1924  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
1925    return false;
1926
1927  bool HasRet = false;
1928  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
1929       UI != UE; ++UI) {
1930    if (UI->getOpcode() != X86ISD::RET_FLAG)
1931      return false;
1932    HasRet = true;
1933  }
1934
1935  if (!HasRet)
1936    return false;
1937
1938  Chain = TCChain;
1939  return true;
1940}
1941
1942MVT
1943X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
1944                                            ISD::NodeType ExtendKind) const {
1945  MVT ReturnMVT;
1946  // TODO: Is this also valid on 32-bit?
1947  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
1948    ReturnMVT = MVT::i8;
1949  else
1950    ReturnMVT = MVT::i32;
1951
1952  MVT MinVT = getRegisterType(ReturnMVT);
1953  return VT.bitsLT(MinVT) ? MinVT : VT;
1954}
1955
1956/// LowerCallResult - Lower the result values of a call into the
1957/// appropriate copies out of the appropriate physical registers.
1958///
1959SDValue
1960X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1961                                   CallingConv::ID CallConv, bool isVarArg,
1962                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1963                                   SDLoc dl, SelectionDAG &DAG,
1964                                   SmallVectorImpl<SDValue> &InVals) const {
1965
1966  // Assign locations to each value returned by this call.
1967  SmallVector<CCValAssign, 16> RVLocs;
1968  bool Is64Bit = Subtarget->is64Bit();
1969  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1970                 getTargetMachine(), RVLocs, *DAG.getContext());
1971  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1972
1973  // Copy all of the result registers out of their specified physreg.
1974  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
1975    CCValAssign &VA = RVLocs[i];
1976    EVT CopyVT = VA.getValVT();
1977
1978    // If this is x86-64, and we disabled SSE, we can't return FP values
1979    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1980        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1981      report_fatal_error("SSE register return with SSE disabled");
1982    }
1983
1984    SDValue Val;
1985
1986    // If this is a call to a function that returns an fp value on the floating
1987    // point stack, we must guarantee the value is popped from the stack, so
1988    // a CopyFromReg is not good enough - the copy instruction may be eliminated
1989    // if the return value is not used. We use the FpPOP_RETVAL instruction
1990    // instead.
1991    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1992      // If we prefer to use the value in xmm registers, copy it out as f80 and
1993      // use a truncate to move it from fp stack reg to xmm reg.
1994      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1995      SDValue Ops[] = { Chain, InFlag };
1996      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
1997                                         MVT::Other, MVT::Glue, Ops), 1);
1998      Val = Chain.getValue(0);
1999
2000      // Round the f80 to the right size, which also moves it to the appropriate
2001      // xmm register.
2002      if (CopyVT != VA.getValVT())
2003        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2004                          // This truncation won't change the value.
2005                          DAG.getIntPtrConstant(1));
2006    } else {
2007      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2008                                 CopyVT, InFlag).getValue(1);
2009      Val = Chain.getValue(0);
2010    }
2011    InFlag = Chain.getValue(2);
2012    InVals.push_back(Val);
2013  }
2014
2015  return Chain;
2016}
2017
2018//===----------------------------------------------------------------------===//
2019//                C & StdCall & Fast Calling Convention implementation
2020//===----------------------------------------------------------------------===//
2021//  The StdCall calling convention is the standard for many Windows API
2022//  routines. It differs from the C calling convention only slightly: the
2023//  callee cleans up the stack instead of the caller, and symbols are
2024//  decorated in a particular way. It doesn't support any vector arguments.
2025//  For info on fast calling convention see Fast Calling Convention (tail call)
2026//  implementation LowerX86_32FastCCCallTo.
2027
2028/// callIsStructReturn - Determines whether a call uses struct return
2029/// semantics.
2030enum StructReturnType {
2031  NotStructReturn,
2032  RegStructReturn,
2033  StackStructReturn
2034};
2035static StructReturnType
2036callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2037  if (Outs.empty())
2038    return NotStructReturn;
2039
2040  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2041  if (!Flags.isSRet())
2042    return NotStructReturn;
2043  if (Flags.isInReg())
2044    return RegStructReturn;
2045  return StackStructReturn;
2046}
2047
2048/// argsAreStructReturn - Determines whether a function uses struct
2049/// return semantics.
2050static StructReturnType
2051argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2052  if (Ins.empty())
2053    return NotStructReturn;
2054
2055  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2056  if (!Flags.isSRet())
2057    return NotStructReturn;
2058  if (Flags.isInReg())
2059    return RegStructReturn;
2060  return StackStructReturn;
2061}
2062
2063/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
2064/// by "Src" to address "Dst" with size and alignment information specified by
2065/// the specific parameter attribute. The copy will be passed as a byval
2066/// function parameter.
2067static SDValue
2068CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2069                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2070                          SDLoc dl) {
2071  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2072
2073  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2074                       /*isVolatile*/false, /*AlwaysInline=*/true,
2075                       MachinePointerInfo(), MachinePointerInfo());
2076}
2077
2078/// IsTailCallConvention - Return true if the calling convention is one that
2079/// supports tail call optimization.
2080static bool IsTailCallConvention(CallingConv::ID CC) {
2081  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2082          CC == CallingConv::HiPE);
2083}
2084
2085/// \brief Return true if the calling convention is a C calling convention.
2086static bool IsCCallConvention(CallingConv::ID CC) {
2087  return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2088          CC == CallingConv::X86_64_SysV);
2089}
2090
2091bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2092  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2093    return false;
2094
2095  CallSite CS(CI);
2096  CallingConv::ID CalleeCC = CS.getCallingConv();
2097  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2098    return false;
2099
2100  return true;
2101}
2102
2103/// FuncIsMadeTailCallSafe - Return true if the function is being made into
2104/// a tailcall target by changing its ABI.
2105static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2106                                   bool GuaranteedTailCallOpt) {
2107  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2108}
2109
2110SDValue
2111X86TargetLowering::LowerMemArgument(SDValue Chain,
2112                                    CallingConv::ID CallConv,
2113                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2114                                    SDLoc dl, SelectionDAG &DAG,
2115                                    const CCValAssign &VA,
2116                                    MachineFrameInfo *MFI,
2117                                    unsigned i) const {
2118  // Create the nodes corresponding to a load from this parameter slot.
2119  ISD::ArgFlagsTy Flags = Ins[i].Flags;
2120  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
2121                              getTargetMachine().Options.GuaranteedTailCallOpt);
2122  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2123  EVT ValVT;
2124
2125  // If value is passed by pointer we have address passed instead of the value
2126  // itself.
2127  if (VA.getLocInfo() == CCValAssign::Indirect)
2128    ValVT = VA.getLocVT();
2129  else
2130    ValVT = VA.getValVT();
2131
2132  // FIXME: For now, all byval parameter objects are marked mutable. This can be
2133  // changed with more analysis.
2134  // In case of tail call optimization, mark all arguments mutable, since they
2135  // could be overwritten by the lowering of arguments for a tail call.
2136  if (Flags.isByVal()) {
2137    unsigned Bytes = Flags.getByValSize();
2138    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2139    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2140    return DAG.getFrameIndex(FI, getPointerTy());
2141  } else {
2142    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2143                                    VA.getLocMemOffset(), isImmutable);
2144    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2145    return DAG.getLoad(ValVT, dl, Chain, FIN,
2146                       MachinePointerInfo::getFixedStack(FI),
2147                       false, false, false, 0);
2148  }
2149}
2150
2151SDValue
2152X86TargetLowering::LowerFormalArguments(SDValue Chain,
2153                                        CallingConv::ID CallConv,
2154                                        bool isVarArg,
2155                                      const SmallVectorImpl<ISD::InputArg> &Ins,
2156                                        SDLoc dl,
2157                                        SelectionDAG &DAG,
2158                                        SmallVectorImpl<SDValue> &InVals)
2159                                          const {
2160  MachineFunction &MF = DAG.getMachineFunction();
2161  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2162
2163  const Function* Fn = MF.getFunction();
2164  if (Fn->hasExternalLinkage() &&
2165      Subtarget->isTargetCygMing() &&
2166      Fn->getName() == "main")
2167    FuncInfo->setForceFramePointer(true);
2168
2169  MachineFrameInfo *MFI = MF.getFrameInfo();
2170  bool Is64Bit = Subtarget->is64Bit();
2171  bool IsWindows = Subtarget->isTargetWindows();
2172  bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2173
2174  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2175         "Var args not supported with calling convention fastcc, ghc or hipe");
2176
2177  // Assign locations to all of the incoming arguments.
2178  SmallVector<CCValAssign, 16> ArgLocs;
2179  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2180                 ArgLocs, *DAG.getContext());
2181
2182  // Allocate shadow area for Win64
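  // (The Win64 ABI reserves 32 bytes of stack, four 8-byte slots, above the
  //  return address as home space for the register parameters.)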
2183  if (IsWin64)
2184    CCInfo.AllocateStack(32, 8);
2185
2186  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2187
2188  unsigned LastVal = ~0U;
2189  SDValue ArgValue;
2190  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2191    CCValAssign &VA = ArgLocs[i];
2192    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2193    // places.
2194    assert(VA.getValNo() != LastVal &&
2195           "Don't support value assigned to multiple locs yet");
2196    (void)LastVal;
2197    LastVal = VA.getValNo();
2198
2199    if (VA.isRegLoc()) {
2200      EVT RegVT = VA.getLocVT();
2201      const TargetRegisterClass *RC;
2202      if (RegVT == MVT::i32)
2203        RC = &X86::GR32RegClass;
2204      else if (Is64Bit && RegVT == MVT::i64)
2205        RC = &X86::GR64RegClass;
2206      else if (RegVT == MVT::f32)
2207        RC = &X86::FR32RegClass;
2208      else if (RegVT == MVT::f64)
2209        RC = &X86::FR64RegClass;
2210      else if (RegVT.is512BitVector())
2211        RC = &X86::VR512RegClass;
2212      else if (RegVT.is256BitVector())
2213        RC = &X86::VR256RegClass;
2214      else if (RegVT.is128BitVector())
2215        RC = &X86::VR128RegClass;
2216      else if (RegVT == MVT::x86mmx)
2217        RC = &X86::VR64RegClass;
2218      else if (RegVT == MVT::v8i1)
2219        RC = &X86::VK8RegClass;
2220      else if (RegVT == MVT::v16i1)
2221        RC = &X86::VK16RegClass;
2222      else
2223        llvm_unreachable("Unknown argument type!");
2224
2225      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2226      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2227
2228      // If this is an 8 or 16-bit value, it is really passed promoted to 32
2229      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2230      // right size.
2231      if (VA.getLocInfo() == CCValAssign::SExt)
2232        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2233                               DAG.getValueType(VA.getValVT()));
2234      else if (VA.getLocInfo() == CCValAssign::ZExt)
2235        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2236                               DAG.getValueType(VA.getValVT()));
2237      else if (VA.getLocInfo() == CCValAssign::BCvt)
2238        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2239
2240      if (VA.isExtInLoc()) {
2241        // Handle MMX values passed in XMM regs.
2242        if (RegVT.isVector())
2243          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2244        else
2245          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2246      }
2247    } else {
2248      assert(VA.isMemLoc());
2249      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2250    }
2251
2252    // If value is passed via pointer - do a load.
2253    if (VA.getLocInfo() == CCValAssign::Indirect)
2254      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2255                             MachinePointerInfo(), false, false, false, 0);
2256
2257    InVals.push_back(ArgValue);
2258  }
2259
2260  // The x86-64 ABIs require that for returning structs by value we copy
2261  // the sret argument into %rax/%eax (depending on ABI) for the return.
2262  // Win32 requires us to put the sret argument to %eax as well.
2263  // Save the argument into a virtual register so that we can access it
2264  // from the return points.
2265  if (MF.getFunction()->hasStructRetAttr() &&
2266      (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
2267    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2268    unsigned Reg = FuncInfo->getSRetReturnReg();
2269    if (!Reg) {
2270      MVT PtrTy = getPointerTy();
2271      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2272      FuncInfo->setSRetReturnReg(Reg);
2273    }
2274    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
2275    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2276  }
2277
2278  unsigned StackSize = CCInfo.getNextStackOffset();
2279  // Align stack specially for tail calls.
2280  if (FuncIsMadeTailCallSafe(CallConv,
2281                             MF.getTarget().Options.GuaranteedTailCallOpt))
2282    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2283
2284  // If the function takes variable number of arguments, make a frame index for
2285  // the start of the first vararg value... for expansion of llvm.va_start.
2286  if (isVarArg) {
2287    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2288                    CallConv != CallingConv::X86_ThisCall)) {
2289      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
2290    }
2291    if (Is64Bit) {
2292      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
2293
2294      // FIXME: We should really autogenerate these arrays
2295      static const uint16_t GPR64ArgRegsWin64[] = {
2296        X86::RCX, X86::RDX, X86::R8,  X86::R9
2297      };
2298      static const uint16_t GPR64ArgRegs64Bit[] = {
2299        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2300      };
2301      static const uint16_t XMMArgRegs64Bit[] = {
2302        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2303        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2304      };
2305      const uint16_t *GPR64ArgRegs;
2306      unsigned NumXMMRegs = 0;
2307
2308      if (IsWin64) {
2309        // The XMM registers which might contain var arg parameters are shadowed
2310        // in their paired GPR.  So we only need to save the GPRs to their home
2311        // slots.
2312        TotalNumIntRegs = 4;
2313        GPR64ArgRegs = GPR64ArgRegsWin64;
2314      } else {
2315        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
2316        GPR64ArgRegs = GPR64ArgRegs64Bit;
2317
2318        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
2319                                                TotalNumXMMRegs);
2320      }
2321      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
2322                                                       TotalNumIntRegs);
2323
2324      bool NoImplicitFloatOps = Fn->getAttributes().
2325        hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2326      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2327             "SSE register cannot be used when SSE is disabled!");
2328      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
2329               NoImplicitFloatOps) &&
2330             "SSE register cannot be used when SSE is disabled!");
2331      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2332          !Subtarget->hasSSE1())
2333        // Kernel mode asks for SSE to be disabled, so don't push them
2334        // on the stack.
2335        TotalNumXMMRegs = 0;
2336
2337      if (IsWin64) {
2338        const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
2339        // Get to the caller-allocated home save location.  Add 8 to account
2340        // for the return address.
2341        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2342        FuncInfo->setRegSaveFrameIndex(
2343          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2344        // Fixup to set vararg frame on shadow area (4 x i64).
2345        if (NumIntRegs < 4)
2346          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2347      } else {
2348        // For X86-64, if there are vararg parameters that are passed via
2349        // registers, then we must store them to their spots on the stack so
2350        // they may be loaded by dereferencing the result of va_next.
2351        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2352        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
2353        FuncInfo->setRegSaveFrameIndex(
2354          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
2355                               false));
2356      }
2357
2358      // Store the integer parameter registers.
2359      SmallVector<SDValue, 8> MemOps;
2360      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2361                                        getPointerTy());
2362      unsigned Offset = FuncInfo->getVarArgsGPOffset();
2363      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
2364        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2365                                  DAG.getIntPtrConstant(Offset));
2366        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
2367                                     &X86::GR64RegClass);
2368        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
2369        SDValue Store =
2370          DAG.getStore(Val.getValue(1), dl, Val, FIN,
2371                       MachinePointerInfo::getFixedStack(
2372                         FuncInfo->getRegSaveFrameIndex(), Offset),
2373                       false, false, 0);
2374        MemOps.push_back(Store);
2375        Offset += 8;
2376      }
2377
2378      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
2379        // Now store the XMM (fp + vector) parameter registers.
2380        SmallVector<SDValue, 11> SaveXMMOps;
2381        SaveXMMOps.push_back(Chain);
2382
2383        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2384        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
2385        SaveXMMOps.push_back(ALVal);
2386
2387        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2388                               FuncInfo->getRegSaveFrameIndex()));
2389        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2390                               FuncInfo->getVarArgsFPOffset()));
2391
2392        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
2393          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
2394                                       &X86::VR128RegClass);
2395          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
2396          SaveXMMOps.push_back(Val);
2397        }
2398        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2399                                     MVT::Other,
2400                                     &SaveXMMOps[0], SaveXMMOps.size()));
2401      }
2402
2403      if (!MemOps.empty())
2404        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2405                            &MemOps[0], MemOps.size());
2406    }
2407  }
2408
2409  // Some CCs need callee pop.
2410  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2411                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
2412    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2413  } else {
2414    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2415    // If this is an sret function, the return should pop the hidden pointer.
2416    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2417        argsAreStructReturn(Ins) == StackStructReturn)
2418      FuncInfo->setBytesToPopOnReturn(4);
2419  }
2420
2421  if (!Is64Bit) {
2422    // RegSaveFrameIndex is X86-64 only.
2423    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2424    if (CallConv == CallingConv::X86_FastCall ||
2425        CallConv == CallingConv::X86_ThisCall)
2426      // fastcc functions can't have varargs.
2427      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2428  }
2429
2430  FuncInfo->setArgumentStackSize(StackSize);
2431
2432  return Chain;
2433}
2434
2435SDValue
2436X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2437                                    SDValue StackPtr, SDValue Arg,
2438                                    SDLoc dl, SelectionDAG &DAG,
2439                                    const CCValAssign &VA,
2440                                    ISD::ArgFlagsTy Flags) const {
2441  unsigned LocMemOffset = VA.getLocMemOffset();
2442  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2443  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2444  if (Flags.isByVal())
2445    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2446
2447  return DAG.getStore(Chain, dl, Arg, PtrOff,
2448                      MachinePointerInfo::getStack(LocMemOffset),
2449                      false, false, 0);
2450}
2451
2452/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
2453/// optimization is performed and it is required.
2454SDValue
2455X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2456                                           SDValue &OutRetAddr, SDValue Chain,
2457                                           bool IsTailCall, bool Is64Bit,
2458                                           int FPDiff, SDLoc dl) const {
2459  // Adjust the Return address stack slot.
2460  EVT VT = getPointerTy();
2461  OutRetAddr = getReturnAddressFrameIndex(DAG);
2462
2463  // Load the "old" Return address.
2464  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2465                           false, false, false, 0);
2466  return SDValue(OutRetAddr.getNode(), 1);
2467}
2468
2469/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
2470/// optimization is performed and it is required (FPDiff!=0).
2471static SDValue
2472EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
2473                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
2474                         unsigned SlotSize, int FPDiff, SDLoc dl) {
2475  // Store the return address to the appropriate stack slot.
2476  if (!FPDiff) return Chain;
2477  // Calculate the new stack slot for the return address.
2478  int NewReturnAddrFI =
2479    MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2480                                         false);
2481  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2482  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2483                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2484                       false, false, 0);
2485  return Chain;
2486}
2487
2488SDValue
2489X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2490                             SmallVectorImpl<SDValue> &InVals) const {
2491  SelectionDAG &DAG                     = CLI.DAG;
2492  SDLoc &dl                             = CLI.DL;
2493  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2494  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2495  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2496  SDValue Chain                         = CLI.Chain;
2497  SDValue Callee                        = CLI.Callee;
2498  CallingConv::ID CallConv              = CLI.CallConv;
2499  bool &isTailCall                      = CLI.IsTailCall;
2500  bool isVarArg                         = CLI.IsVarArg;
2501
2502  MachineFunction &MF = DAG.getMachineFunction();
2503  bool Is64Bit        = Subtarget->is64Bit();
2504  bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2505  bool IsWindows      = Subtarget->isTargetWindows();
2506  StructReturnType SR = callIsStructReturn(Outs);
2507  bool IsSibcall      = false;
2508
2509  if (MF.getTarget().Options.DisableTailCalls)
2510    isTailCall = false;
2511
2512  if (isTailCall) {
2513    // Check if it's really possible to do a tail call.
2514    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2515                    isVarArg, SR != NotStructReturn,
2516                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2517                    Outs, OutVals, Ins, DAG);
2518
2519    // Sibcalls are automatically detected tailcalls which do not require
2520    // ABI changes.
2521    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2522      IsSibcall = true;
2523
2524    if (isTailCall)
2525      ++NumTailCalls;
2526  }
2527
2528  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2529         "Var args not supported with calling convention fastcc, ghc or hipe");
2530
2531  // Analyze operands of the call, assigning locations to each operand.
2532  SmallVector<CCValAssign, 16> ArgLocs;
2533  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2534                 ArgLocs, *DAG.getContext());
2535
2536  // Allocate shadow area for Win64
2537  if (IsWin64)
2538    CCInfo.AllocateStack(32, 8);
2539
2540  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2541
2542  // Get a count of how many bytes are to be pushed on the stack.
2543  unsigned NumBytes = CCInfo.getNextStackOffset();
2544  if (IsSibcall)
2545    // This is a sibcall. The memory operands are already available in the
2546    // caller's own caller's stack.
2547    NumBytes = 0;
2548  else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
2549           IsTailCallConvention(CallConv))
2550    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2551
2552  int FPDiff = 0;
2553  if (isTailCall && !IsSibcall) {
2554    // Lower arguments at fp - stackoffset + fpdiff.
2555    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2556    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2557
2558    FPDiff = NumBytesCallerPushed - NumBytes;
2559
2560    // Set the delta of movement of the returnaddr stackslot, but only if the
2561    // new delta implies a larger movement (i.e. is more negative) than before.
2562    if (FPDiff < X86Info->getTCReturnAddrDelta())
2563      X86Info->setTCReturnAddrDelta(FPDiff);
2564  }
2565
2566  if (!IsSibcall)
2567    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
2568                                 dl);
2569
2570  SDValue RetAddrFrIdx;
2571  // Load return address for tail calls.
2572  if (isTailCall && FPDiff)
2573    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2574                                    Is64Bit, FPDiff, dl);
2575
2576  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2577  SmallVector<SDValue, 8> MemOpChains;
2578  SDValue StackPtr;
2579
2580  // Walk the register/memloc assignments, inserting copies/loads.  In the case
2581  // of tail call optimization arguments are handled later.
2582  const X86RegisterInfo *RegInfo =
2583    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
2584  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2585    CCValAssign &VA = ArgLocs[i];
2586    EVT RegVT = VA.getLocVT();
2587    SDValue Arg = OutVals[i];
2588    ISD::ArgFlagsTy Flags = Outs[i].Flags;
2589    bool isByVal = Flags.isByVal();
2590
2591    // Promote the value if needed.
2592    switch (VA.getLocInfo()) {
2593    default: llvm_unreachable("Unknown loc info!");
2594    case CCValAssign::Full: break;
2595    case CCValAssign::SExt:
2596      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2597      break;
2598    case CCValAssign::ZExt:
2599      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2600      break;
2601    case CCValAssign::AExt:
2602      if (RegVT.is128BitVector()) {
2603        // Special case: passing MMX values in XMM registers.
2604        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2605        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2606        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2607      } else
2608        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2609      break;
2610    case CCValAssign::BCvt:
2611      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2612      break;
2613    case CCValAssign::Indirect: {
2614      // Store the argument.
2615      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2616      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2617      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2618                           MachinePointerInfo::getFixedStack(FI),
2619                           false, false, 0);
2620      Arg = SpillSlot;
2621      break;
2622    }
2623    }
2624
2625    if (VA.isRegLoc()) {
2626      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2627      if (isVarArg && IsWin64) {
2628        // The Win64 ABI requires an XMM argument register to be copied to the
2629        // corresponding shadow GPR if the callee is a varargs function.
2630        unsigned ShadowReg = 0;
2631        switch (VA.getLocReg()) {
2632        case X86::XMM0: ShadowReg = X86::RCX; break;
2633        case X86::XMM1: ShadowReg = X86::RDX; break;
2634        case X86::XMM2: ShadowReg = X86::R8; break;
2635        case X86::XMM3: ShadowReg = X86::R9; break;
2636        }
2637        if (ShadowReg)
2638          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2639      }
2640    } else if (!IsSibcall && (!isTailCall || isByVal)) {
2641      assert(VA.isMemLoc());
2642      if (StackPtr.getNode() == 0)
2643        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2644                                      getPointerTy());
2645      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2646                                             dl, DAG, VA, Flags));
2647    }
2648  }
2649
2650  if (!MemOpChains.empty())
2651    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2652                        &MemOpChains[0], MemOpChains.size());
2653
2654  if (Subtarget->isPICStyleGOT()) {
2655    // ELF / PIC requires the GOT pointer to be in the EBX register before
2656    // function calls via the PLT.
2657    if (!isTailCall) {
2658      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2659               DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2660    } else {
2661      // If we are tail calling and generating PIC/GOT style code load the
2662      // address of the callee into ECX. The value in ecx is used as target of
2663      // the tail jump. This is done to circumvent the ebx/callee-saved problem
2664      // for tail calls on PIC/GOT architectures. Normally we would just put the
2665      // address of GOT into ebx and then call target@PLT. But for tail calls
2666      // ebx would be restored (since ebx is callee saved) before jumping to the
2667      // target@PLT.
2668
2669      // Note: The actual moving to ECX is done further down.
2670      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2671      if (G && !G->getGlobal()->hasHiddenVisibility() &&
2672          !G->getGlobal()->hasProtectedVisibility())
2673        Callee = LowerGlobalAddress(Callee, DAG);
2674      else if (isa<ExternalSymbolSDNode>(Callee))
2675        Callee = LowerExternalSymbol(Callee, DAG);
2676    }
2677  }
2678
2679  if (Is64Bit && isVarArg && !IsWin64) {
2680    // From AMD64 ABI document:
2681    // For calls that may call functions that use varargs or stdargs
2682    // (prototype-less calls or calls to functions containing ellipsis (...) in
2683    // the declaration) %al is used as a hidden argument to specify the number
2684    // of SSE registers used. The contents of %al do not need to match exactly
2685    // the number of registers, but must be an upper bound on the number of SSE
2686    // registers used and must be in the range 0 - 8 inclusive.
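    // For example, a prototype-less call that passes a single 'double' in %xmm0
    // can set %al to 1 (any value from 1 through 8 that bounds the count of SSE
    // registers actually used is acceptable).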
2687
2688    // Count the number of XMM registers allocated.
2689    static const uint16_t XMMArgRegs[] = {
2690      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2691      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2692    };
2693    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2694    assert((Subtarget->hasSSE1() || !NumXMMRegs)
2695           && "SSE registers cannot be used when SSE is disabled");
2696
2697    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
2698                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
2699  }
2700
2701  // For tail calls lower the arguments to the 'real' stack slot.
2702  if (isTailCall) {
2703    // Force all the incoming stack arguments to be loaded from the stack
2704    // before any new outgoing arguments are stored to the stack, because the
2705    // outgoing stack slots may alias the incoming argument stack slots, and
2706    // the alias isn't otherwise explicit. This is slightly more conservative
2707    // than necessary, because it means that each store effectively depends
2708    // on every argument instead of just those arguments it would clobber.
2709    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2710
2711    SmallVector<SDValue, 8> MemOpChains2;
2712    SDValue FIN;
2713    int FI = 0;
2714    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2715      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2716        CCValAssign &VA = ArgLocs[i];
2717        if (VA.isRegLoc())
2718          continue;
2719        assert(VA.isMemLoc());
2720        SDValue Arg = OutVals[i];
2721        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2722        // Create frame index.
2723        int32_t Offset = VA.getLocMemOffset()+FPDiff;
2724        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2725        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2726        FIN = DAG.getFrameIndex(FI, getPointerTy());
2727
2728        if (Flags.isByVal()) {
2729          // Copy relative to framepointer.
2730          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2731          if (StackPtr.getNode() == 0)
2732            StackPtr = DAG.getCopyFromReg(Chain, dl,
2733                                          RegInfo->getStackRegister(),
2734                                          getPointerTy());
2735          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2736
2737          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2738                                                           ArgChain,
2739                                                           Flags, DAG, dl));
2740        } else {
2741          // Store relative to framepointer.
2742          MemOpChains2.push_back(
2743            DAG.getStore(ArgChain, dl, Arg, FIN,
2744                         MachinePointerInfo::getFixedStack(FI),
2745                         false, false, 0));
2746        }
2747      }
2748    }
2749
2750    if (!MemOpChains2.empty())
2751      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2752                          &MemOpChains2[0], MemOpChains2.size());
2753
2754    // Store the return address to the appropriate stack slot.
2755    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2756                                     getPointerTy(), RegInfo->getSlotSize(),
2757                                     FPDiff, dl);
2758  }
2759
2760  // Build a sequence of copy-to-reg nodes chained together with token chain
2761  // and flag operands which copy the outgoing args into registers.
2762  SDValue InFlag;
2763  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2764    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2765                             RegsToPass[i].second, InFlag);
2766    InFlag = Chain.getValue(1);
2767  }
2768
2769  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2770    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2771    // In the 64-bit large code model, we have to make all calls
2772    // through a register, since the call instruction's 32-bit
2773    // pc-relative offset may not be large enough to hold the whole
2774    // address.
2775  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2776    // If the callee is a GlobalAddress node (quite common, every direct call
2777    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2778    // it.
2779
2780    // We should use an extra load for direct calls to dllimported functions in
2781    // non-JIT mode.
2782    const GlobalValue *GV = G->getGlobal();
2783    if (!GV->hasDLLImportLinkage()) {
2784      unsigned char OpFlags = 0;
2785      bool ExtraLoad = false;
2786      unsigned WrapperKind = ISD::DELETED_NODE;
2787
2788      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2789      // external symbols must go through the PLT in PIC mode.  If the symbol
2790      // has hidden or protected visibility, or if it is static or local, then
2791      // we don't need to use the PLT - we can directly call it.
2792      if (Subtarget->isTargetELF() &&
2793          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2794          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2795        OpFlags = X86II::MO_PLT;
2796      } else if (Subtarget->isPICStyleStubAny() &&
2797                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
2798                 (!Subtarget->getTargetTriple().isMacOSX() ||
2799                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2800        // PC-relative references to external symbols should go through $stub,
2801        // unless we're building with the Leopard linker or later, which
2802        // automatically synthesizes these stubs.
2803        OpFlags = X86II::MO_DARWIN_STUB;
2804      } else if (Subtarget->isPICStyleRIPRel() &&
2805                 isa<Function>(GV) &&
2806                 cast<Function>(GV)->getAttributes().
2807                   hasAttribute(AttributeSet::FunctionIndex,
2808                                Attribute::NonLazyBind)) {
2809        // If the function is marked as non-lazy, generate an indirect call
2810        // which loads from the GOT directly. This avoids runtime overhead
2811        // at the cost of eager binding (and one extra byte of encoding).
2812        OpFlags = X86II::MO_GOTPCREL;
2813        WrapperKind = X86ISD::WrapperRIP;
2814        ExtraLoad = true;
2815      }
2816
2817      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2818                                          G->getOffset(), OpFlags);
2819
2820      // Add a wrapper if needed.
2821      if (WrapperKind != ISD::DELETED_NODE)
2822        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
2823      // Add extra indirection if needed.
2824      if (ExtraLoad)
2825        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
2826                             MachinePointerInfo::getGOT(),
2827                             false, false, false, 0);
2828    }
2829  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2830    unsigned char OpFlags = 0;
2831
2832    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2833    // external symbols should go through the PLT.
2834    if (Subtarget->isTargetELF() &&
2835        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2836      OpFlags = X86II::MO_PLT;
2837    } else if (Subtarget->isPICStyleStubAny() &&
2838               (!Subtarget->getTargetTriple().isMacOSX() ||
2839                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2840      // PC-relative references to external symbols should go through $stub,
2841      // unless we're building with the Leopard linker or later, which
2842      // automatically synthesizes these stubs.
2843      OpFlags = X86II::MO_DARWIN_STUB;
2844    }
2845
2846    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2847                                         OpFlags);
2848  }
2849
2850  // Returns a chain & a flag for retval copy to use.
2851  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2852  SmallVector<SDValue, 8> Ops;
2853
2854  if (!IsSibcall && isTailCall) {
2855    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2856                           DAG.getIntPtrConstant(0, true), InFlag, dl);
2857    InFlag = Chain.getValue(1);
2858  }
2859
2860  Ops.push_back(Chain);
2861  Ops.push_back(Callee);
2862
2863  if (isTailCall)
2864    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2865
2866  // Add argument registers to the end of the list so that they are known live
2867  // into the call.
2868  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2869    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2870                                  RegsToPass[i].second.getValueType()));
2871
2872  // Add a register mask operand representing the call-preserved registers.
2873  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
2874  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
2875  assert(Mask && "Missing call preserved mask for calling convention");
2876  Ops.push_back(DAG.getRegisterMask(Mask));
2877
2878  if (InFlag.getNode())
2879    Ops.push_back(InFlag);
2880
2881  if (isTailCall) {
2882    // We used to do:
2883    //// If this is the first return lowered for this function, add the regs
2884    //// to the liveout set for the function.
2885    // This isn't right, although it's probably harmless on x86; liveouts
2886    // should be computed from returns not tail calls.  Consider a void
2887    // function making a tail call to a function returning int.
2888    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
2889  }
2890
2891  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2892  InFlag = Chain.getValue(1);
2893
2894  // Create the CALLSEQ_END node.
2895  unsigned NumBytesForCalleeToPush;
2896  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2897                       getTargetMachine().Options.GuaranteedTailCallOpt))
2898    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2899  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2900           SR == StackStructReturn)
2901    // If this is a call to a struct-return function, the callee
2902    // pops the hidden struct pointer, so we have to push it back.
2903    // This is common for Darwin/X86, Linux & Mingw32 targets.
2904    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
2905    NumBytesForCalleeToPush = 4;
2906  else
2907    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2908
2909  // Returns a flag for retval copy to use.
2910  if (!IsSibcall) {
2911    Chain = DAG.getCALLSEQ_END(Chain,
2912                               DAG.getIntPtrConstant(NumBytes, true),
2913                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2914                                                     true),
2915                               InFlag, dl);
2916    InFlag = Chain.getValue(1);
2917  }
2918
2919  // Handle result values, copying them out of physregs into vregs that we
2920  // return.
2921  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2922                         Ins, dl, DAG, InVals);
2923}
2924
2925//===----------------------------------------------------------------------===//
2926//                Fast Calling Convention (tail call) implementation
2927//===----------------------------------------------------------------------===//
2928
2929//  Like stdcall, the callee cleans up the arguments, except that ECX is
2930//  reserved for storing the address of the tail-called function. Only 2
2931//  registers are free for argument passing (inreg). Tail call optimization is
2932//  performed provided:
2933//                * tailcallopt is enabled
2934//                * caller/callee are fastcc
2935//  On X86_64 architecture with GOT-style position independent code only local
2936//  (within module) calls are supported at the moment.
2937//  To keep the stack aligned according to the platform ABI, the function
2938//  GetAlignedArgumentStackSize ensures that the argument delta is always a
2939//  multiple of the stack alignment. (Dynamic linkers, e.g. Darwin's dyld, need this.)
2940//  If a tail-called function has more arguments than the caller, the caller
2941//  needs to make sure that there is room to move the RETADDR to. This is
2942//  achieved by reserving an area the size of the argument delta right after the
2943//  original RETADDR, but before the saved frame pointer or the spilled registers,
2944//  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2945//  stack layout:
2946//    arg1
2947//    arg2
2948//    RETADDR
2949//    [ new RETADDR
2950//      move area ]
2951//    (possible EBP)
2952//    ESI
2953//    EDI
2954//    local1 ..
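//  For example, if the callee needs 16 more bytes of argument space than the
//  caller provided, FPDiff is -16 and a 16-byte move area is reserved below the
//  original RETADDR so the return address can be relocated before the tail jump.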
2955
2956/// GetAlignedArgumentStackSize - Pad the stack size to, e.g., 16n + 12 bytes
2957/// so that the stack stays 16-byte aligned once the return address is pushed.
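/// For example, with a 16-byte stack alignment and a 4-byte slot size, a
/// 20-byte argument area is padded to 28 bytes; pushing the 4-byte return
/// address then yields an aligned 32-byte frame.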
2958unsigned
2959X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2960                                               SelectionDAG& DAG) const {
2961  MachineFunction &MF = DAG.getMachineFunction();
2962  const TargetMachine &TM = MF.getTarget();
2963  const X86RegisterInfo *RegInfo =
2964    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
2965  const TargetFrameLowering &TFI = *TM.getFrameLowering();
2966  unsigned StackAlignment = TFI.getStackAlignment();
2967  uint64_t AlignMask = StackAlignment - 1;
2968  int64_t Offset = StackSize;
2969  unsigned SlotSize = RegInfo->getSlotSize();
2970  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
2971    // The misalignment is small enough, so just add the difference.
2972    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2973  } else {
2974    // Mask out the lower bits, then add StackAlignment plus (StackAlignment - SlotSize).
2975    Offset = ((~AlignMask) & Offset) + StackAlignment +
2976      (StackAlignment-SlotSize);
2977  }
2978  return Offset;
2979}
2980
2981/// MatchingStackOffset - Return true if the given stack call argument is
2982/// already available in the same position (relatively) of the caller's
2983/// incoming argument stack.
2984static
2985bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2986                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2987                         const X86InstrInfo *TII) {
2988  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2989  int FI = INT_MAX;
2990  if (Arg.getOpcode() == ISD::CopyFromReg) {
2991    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2992    if (!TargetRegisterInfo::isVirtualRegister(VR))
2993      return false;
2994    MachineInstr *Def = MRI->getVRegDef(VR);
2995    if (!Def)
2996      return false;
2997    if (!Flags.isByVal()) {
2998      if (!TII->isLoadFromStackSlot(Def, FI))
2999        return false;
3000    } else {
3001      unsigned Opcode = Def->getOpcode();
3002      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
3003          Def->getOperand(1).isFI()) {
3004        FI = Def->getOperand(1).getIndex();
3005        Bytes = Flags.getByValSize();
3006      } else
3007        return false;
3008    }
3009  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3010    if (Flags.isByVal())
3011      // ByVal argument is passed in as a pointer but it's now being
3012      // dereferenced. e.g.
3013      // define @foo(%struct.X* %A) {
3014      //   tail call @bar(%struct.X* byval %A)
3015      // }
3016      return false;
3017    SDValue Ptr = Ld->getBasePtr();
3018    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3019    if (!FINode)
3020      return false;
3021    FI = FINode->getIndex();
3022  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3023    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3024    FI = FINode->getIndex();
3025    Bytes = Flags.getByValSize();
3026  } else
3027    return false;
3028
3029  assert(FI != INT_MAX);
3030  if (!MFI->isFixedObjectIndex(FI))
3031    return false;
3032  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3033}
3034
3035/// IsEligibleForTailCallOptimization - Check whether the call is eligible
3036/// for tail call optimization. Targets which want to do tail call
3037/// optimization should implement this function.
3038bool
3039X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3040                                                     CallingConv::ID CalleeCC,
3041                                                     bool isVarArg,
3042                                                     bool isCalleeStructRet,
3043                                                     bool isCallerStructRet,
3044                                                     Type *RetTy,
3045                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
3046                                    const SmallVectorImpl<SDValue> &OutVals,
3047                                    const SmallVectorImpl<ISD::InputArg> &Ins,
3048                                                     SelectionDAG &DAG) const {
3049  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3050    return false;
3051
3052  // If -tailcallopt is specified, make fastcc functions tail-callable.
3053  const MachineFunction &MF = DAG.getMachineFunction();
3054  const Function *CallerF = MF.getFunction();
3055
3056  // If the function return type is x86_fp80 and the callee return type is not,
3057  // then the FP_EXTEND of the call result is not a nop. It's not safe to
3058  // perform a tailcall optimization here.
3059  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3060    return false;
3061
3062  CallingConv::ID CallerCC = CallerF->getCallingConv();
3063  bool CCMatch = CallerCC == CalleeCC;
3064  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3065  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3066
3067  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
3068    if (IsTailCallConvention(CalleeCC) && CCMatch)
3069      return true;
3070    return false;
3071  }
3072
3073  // Look for obvious safe cases to perform tail call optimization that do not
3074  // require ABI changes. This is what gcc calls sibcall.
3075
3076  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3077  // emit a special epilogue.
3078  const X86RegisterInfo *RegInfo =
3079    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
3080  if (RegInfo->needsStackRealignment(MF))
3081    return false;
3082
3083  // Also avoid sibcall optimization if either caller or callee uses struct
3084  // return semantics.
3085  if (isCalleeStructRet || isCallerStructRet)
3086    return false;
3087
3088  // An stdcall caller is expected to clean up its arguments; the callee
3089  // isn't going to do that.
3090  if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
3091    return false;
3092
3093  // Do not sibcall optimize vararg calls unless all arguments are passed via
3094  // registers.
3095  if (isVarArg && !Outs.empty()) {
3096
3097    // Optimizing for varargs on Win64 is unlikely to be safe without
3098    // additional testing.
3099    if (IsCalleeWin64 || IsCallerWin64)
3100      return false;
3101
3102    SmallVector<CCValAssign, 16> ArgLocs;
3103    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
3104                   getTargetMachine(), ArgLocs, *DAG.getContext());
3105
3106    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3107    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3108      if (!ArgLocs[i].isRegLoc())
3109        return false;
3110  }
3111
3112  // If the call result is in ST0 / ST1, it needs to be popped off the x87
3113  // stack.  Therefore, if the result is not used, it is not safe to optimize
3114  // this into a sibcall.
3115  bool Unused = false;
3116  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3117    if (!Ins[i].Used) {
3118      Unused = true;
3119      break;
3120    }
3121  }
3122  if (Unused) {
3123    SmallVector<CCValAssign, 16> RVLocs;
3124    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
3125                   getTargetMachine(), RVLocs, *DAG.getContext());
3126    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3127    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3128      CCValAssign &VA = RVLocs[i];
3129      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
3130        return false;
3131    }
3132  }
3133
3134  // If the calling conventions do not match, then we'd better make sure the
3135  // results are returned in the same way as what the caller expects.
3136  if (!CCMatch) {
3137    SmallVector<CCValAssign, 16> RVLocs1;
3138    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
3139                    getTargetMachine(), RVLocs1, *DAG.getContext());
3140    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3141
3142    SmallVector<CCValAssign, 16> RVLocs2;
3143    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
3144                    getTargetMachine(), RVLocs2, *DAG.getContext());
3145    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3146
3147    if (RVLocs1.size() != RVLocs2.size())
3148      return false;
3149    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3150      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3151        return false;
3152      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3153        return false;
3154      if (RVLocs1[i].isRegLoc()) {
3155        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3156          return false;
3157      } else {
3158        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3159          return false;
3160      }
3161    }
3162  }
3163
3164  // If the callee takes no arguments then go on to check the results of the
3165  // call.
3166  if (!Outs.empty()) {
3167    // Check if stack adjustment is needed. For now, do not do this if any
3168    // argument is passed on the stack.
3169    SmallVector<CCValAssign, 16> ArgLocs;
3170    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
3171                   getTargetMachine(), ArgLocs, *DAG.getContext());
3172
3173    // Allocate shadow area for Win64
3174    if (IsCalleeWin64)
3175      CCInfo.AllocateStack(32, 8);
3176
3177    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3178    if (CCInfo.getNextStackOffset()) {
3179      MachineFunction &MF = DAG.getMachineFunction();
3180      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3181        return false;
3182
3183      // Check whether the arguments are already laid out in the same locations
3184      // as the caller's fixed stack objects.
3185      MachineFrameInfo *MFI = MF.getFrameInfo();
3186      const MachineRegisterInfo *MRI = &MF.getRegInfo();
3187      const X86InstrInfo *TII =
3188        ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
3189      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3190        CCValAssign &VA = ArgLocs[i];
3191        SDValue Arg = OutVals[i];
3192        ISD::ArgFlagsTy Flags = Outs[i].Flags;
3193        if (VA.getLocInfo() == CCValAssign::Indirect)
3194          return false;
3195        if (!VA.isRegLoc()) {
3196          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3197                                   MFI, MRI, TII))
3198            return false;
3199        }
3200      }
3201    }
3202
3203    // If the tailcall address may be in a register, then make sure it's
3204    // possible to register allocate for it. In 32-bit, the call address can
3205    // only target EAX, EDX, or ECX since the tail call must be scheduled after
3206    // callee-saved registers are restored. These happen to be the same
3207    // registers used to pass 'inreg' arguments so watch out for those.
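    // For example, with PIC relocation only two such registers are available, so
    // a call that already passes two 'inreg' arguments in ECX and EDX leaves no
    // register free to hold the callee address and the sibcall is rejected.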
3208    if (!Subtarget->is64Bit() &&
3209        ((!isa<GlobalAddressSDNode>(Callee) &&
3210          !isa<ExternalSymbolSDNode>(Callee)) ||
3211         getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
3212      unsigned NumInRegs = 0;
3213      // In PIC we need an extra register to formulate the address computation
3214      // for the callee.
3215      unsigned MaxInRegs =
3216          (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3217
3218      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3219        CCValAssign &VA = ArgLocs[i];
3220        if (!VA.isRegLoc())
3221          continue;
3222        unsigned Reg = VA.getLocReg();
3223        switch (Reg) {
3224        default: break;
3225        case X86::EAX: case X86::EDX: case X86::ECX:
3226          if (++NumInRegs == MaxInRegs)
3227            return false;
3228          break;
3229        }
3230      }
3231    }
3232  }
3233
3234  return true;
3235}
3236
3237FastISel *
3238X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3239                                  const TargetLibraryInfo *libInfo) const {
3240  return X86::createFastISel(funcInfo, libInfo);
3241}
3242
3243//===----------------------------------------------------------------------===//
3244//                           Other Lowering Hooks
3245//===----------------------------------------------------------------------===//
3246
3247static bool MayFoldLoad(SDValue Op) {
3248  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3249}
3250
3251static bool MayFoldIntoStore(SDValue Op) {
3252  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3253}
3254
3255static bool isTargetShuffle(unsigned Opcode) {
3256  switch(Opcode) {
3257  default: return false;
3258  case X86ISD::PSHUFD:
3259  case X86ISD::PSHUFHW:
3260  case X86ISD::PSHUFLW:
3261  case X86ISD::SHUFP:
3262  case X86ISD::PALIGNR:
3263  case X86ISD::MOVLHPS:
3264  case X86ISD::MOVLHPD:
3265  case X86ISD::MOVHLPS:
3266  case X86ISD::MOVLPS:
3267  case X86ISD::MOVLPD:
3268  case X86ISD::MOVSHDUP:
3269  case X86ISD::MOVSLDUP:
3270  case X86ISD::MOVDDUP:
3271  case X86ISD::MOVSS:
3272  case X86ISD::MOVSD:
3273  case X86ISD::UNPCKL:
3274  case X86ISD::UNPCKH:
3275  case X86ISD::VPERMILP:
3276  case X86ISD::VPERM2X128:
3277  case X86ISD::VPERMI:
3278    return true;
3279  }
3280}
3281
3282static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3283                                    SDValue V1, SelectionDAG &DAG) {
3284  switch(Opc) {
3285  default: llvm_unreachable("Unknown x86 shuffle node");
3286  case X86ISD::MOVSHDUP:
3287  case X86ISD::MOVSLDUP:
3288  case X86ISD::MOVDDUP:
3289    return DAG.getNode(Opc, dl, VT, V1);
3290  }
3291}
3292
3293static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3294                                    SDValue V1, unsigned TargetMask,
3295                                    SelectionDAG &DAG) {
3296  switch(Opc) {
3297  default: llvm_unreachable("Unknown x86 shuffle node");
3298  case X86ISD::PSHUFD:
3299  case X86ISD::PSHUFHW:
3300  case X86ISD::PSHUFLW:
3301  case X86ISD::VPERMILP:
3302  case X86ISD::VPERMI:
3303    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3304  }
3305}
3306
3307static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3308                                    SDValue V1, SDValue V2, unsigned TargetMask,
3309                                    SelectionDAG &DAG) {
3310  switch(Opc) {
3311  default: llvm_unreachable("Unknown x86 shuffle node");
3312  case X86ISD::PALIGNR:
3313  case X86ISD::SHUFP:
3314  case X86ISD::VPERM2X128:
3315    return DAG.getNode(Opc, dl, VT, V1, V2,
3316                       DAG.getConstant(TargetMask, MVT::i8));
3317  }
3318}
3319
3320static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3321                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
3322  switch(Opc) {
3323  default: llvm_unreachable("Unknown x86 shuffle node");
3324  case X86ISD::MOVLHPS:
3325  case X86ISD::MOVLHPD:
3326  case X86ISD::MOVHLPS:
3327  case X86ISD::MOVLPS:
3328  case X86ISD::MOVLPD:
3329  case X86ISD::MOVSS:
3330  case X86ISD::MOVSD:
3331  case X86ISD::UNPCKL:
3332  case X86ISD::UNPCKH:
3333    return DAG.getNode(Opc, dl, VT, V1, V2);
3334  }
3335}
3336
3337SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3338  MachineFunction &MF = DAG.getMachineFunction();
3339  const X86RegisterInfo *RegInfo =
3340    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
3341  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3342  int ReturnAddrIndex = FuncInfo->getRAIndex();
3343
3344  if (ReturnAddrIndex == 0) {
3345    // Set up a frame object for the return address.
3346    unsigned SlotSize = RegInfo->getSlotSize();
3347    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3348                                                           -(int64_t)SlotSize,
3349                                                           false);
3350    FuncInfo->setRAIndex(ReturnAddrIndex);
3351  }
3352
3353  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3354}
3355
3356bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3357                                       bool hasSymbolicDisplacement) {
3358  // The offset should fit into a 32-bit immediate field.
3359  if (!isInt<32>(Offset))
3360    return false;
3361
3362  // If we don't have a symbolic displacement - we don't have any extra
3363  // restrictions.
3364  if (!hasSymbolicDisplacement)
3365    return true;
3366
3367  // FIXME: Some tweaks might be needed for medium code model.
3368  if (M != CodeModel::Small && M != CodeModel::Kernel)
3369    return false;
3370
3371  // For the small code model we assume the last object lies within 16MB of the
3372  // end of the 31-bit address boundary. We may also accept fairly large negative
3373  // constants, since all objects lie in the positive half of the address space.
3374  if (M == CodeModel::Small && Offset < 16*1024*1024)
3375    return true;
3376
3377  // For the kernel code model we know that all objects reside in the negative
3378  // half of the 32-bit address space. We must not accept negative offsets, which
3379  // may push an address out of range, but we can accept fairly large positive ones.
3380  if (M == CodeModel::Kernel && Offset > 0)
3381    return true;
3382
3383  return false;
3384}
3385
3386/// isCalleePop - Determines whether the callee is required to pop its
3387/// own arguments. Callee pop is necessary to support tail calls.
3388bool X86::isCalleePop(CallingConv::ID CallingConv,
3389                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3390  if (IsVarArg)
3391    return false;
3392
3393  switch (CallingConv) {
3394  default:
3395    return false;
3396  case CallingConv::X86_StdCall:
3397    return !is64Bit;
3398  case CallingConv::X86_FastCall:
3399    return !is64Bit;
3400  case CallingConv::X86_ThisCall:
3401    return !is64Bit;
3402  case CallingConv::Fast:
3403    return TailCallOpt;
3404  case CallingConv::GHC:
3405    return TailCallOpt;
3406  case CallingConv::HiPE:
3407    return TailCallOpt;
3408  }
3409}
3410
3411/// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the
3412/// X86-specific condition code, returning the condition code and the LHS/RHS of
3413/// the comparison to make.
3414static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3415                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3416  if (!isFP) {
3417    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3418      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3419        // X > -1   -> compare X with 0, jump if !sign.
3420        RHS = DAG.getConstant(0, RHS.getValueType());
3421        return X86::COND_NS;
3422      }
3423      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3424        // X < 0   -> compare X with 0, jump on sign.
3425        return X86::COND_S;
3426      }
3427      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3428        // X < 1   -> X <= 0
3429        RHS = DAG.getConstant(0, RHS.getValueType());
3430        return X86::COND_LE;
3431      }
3432    }
3433
3434    switch (SetCCOpcode) {
3435    default: llvm_unreachable("Invalid integer condition!");
3436    case ISD::SETEQ:  return X86::COND_E;
3437    case ISD::SETGT:  return X86::COND_G;
3438    case ISD::SETGE:  return X86::COND_GE;
3439    case ISD::SETLT:  return X86::COND_L;
3440    case ISD::SETLE:  return X86::COND_LE;
3441    case ISD::SETNE:  return X86::COND_NE;
3442    case ISD::SETULT: return X86::COND_B;
3443    case ISD::SETUGT: return X86::COND_A;
3444    case ISD::SETULE: return X86::COND_BE;
3445    case ISD::SETUGE: return X86::COND_AE;
3446    }
3447  }
3448
3449  // First determine if it is required or is profitable to flip the operands.
3450
3451  // If LHS is a foldable load, but RHS is not, flip the condition.
3452  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3453      !ISD::isNON_EXTLoad(RHS.getNode())) {
3454    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3455    std::swap(LHS, RHS);
3456  }
3457
3458  switch (SetCCOpcode) {
3459  default: break;
3460  case ISD::SETOLT:
3461  case ISD::SETOLE:
3462  case ISD::SETUGT:
3463  case ISD::SETUGE:
3464    std::swap(LHS, RHS);
3465    break;
3466  }
3467
3468  // On a floating point condition, the flags are set as follows:
3469  // ZF  PF  CF   op
3470  //  0 | 0 | 0 | X > Y
3471  //  0 | 0 | 1 | X < Y
3472  //  1 | 0 | 0 | X == Y
3473  //  1 | 1 | 1 | unordered
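  // For example, an ordered X > Y leaves ZF=0 and CF=0, the unsigned 'above'
  // condition, so SETOGT maps to COND_A below (SETOLT maps there too, but only
  // after its operands were swapped above).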
3474  switch (SetCCOpcode) {
3475  default: llvm_unreachable("Condcode should be pre-legalized away");
3476  case ISD::SETUEQ:
3477  case ISD::SETEQ:   return X86::COND_E;
3478  case ISD::SETOLT:              // flipped
3479  case ISD::SETOGT:
3480  case ISD::SETGT:   return X86::COND_A;
3481  case ISD::SETOLE:              // flipped
3482  case ISD::SETOGE:
3483  case ISD::SETGE:   return X86::COND_AE;
3484  case ISD::SETUGT:              // flipped
3485  case ISD::SETULT:
3486  case ISD::SETLT:   return X86::COND_B;
3487  case ISD::SETUGE:              // flipped
3488  case ISD::SETULE:
3489  case ISD::SETLE:   return X86::COND_BE;
3490  case ISD::SETONE:
3491  case ISD::SETNE:   return X86::COND_NE;
3492  case ISD::SETUO:   return X86::COND_P;
3493  case ISD::SETO:    return X86::COND_NP;
3494  case ISD::SETOEQ:
3495  case ISD::SETUNE:  return X86::COND_INVALID;
3496  }
3497}
3498
3499/// hasFPCMov - is there a floating point cmov for the specific X86 condition
3500/// code? The current x86 ISA includes the following FP cmov instructions:
3501/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3502static bool hasFPCMov(unsigned X86CC) {
3503  switch (X86CC) {
3504  default:
3505    return false;
3506  case X86::COND_B:
3507  case X86::COND_BE:
3508  case X86::COND_E:
3509  case X86::COND_P:
3510  case X86::COND_A:
3511  case X86::COND_AE:
3512  case X86::COND_NE:
3513  case X86::COND_NP:
3514    return true;
3515  }
3516}
3517
3518/// isFPImmLegal - Returns true if the target can instruction select the
3519/// specified FP immediate natively. If false, the legalizer will
3520/// materialize the FP immediate as a load from a constant pool.
3521bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3522  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3523    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3524      return true;
3525  }
3526  return false;
3527}
3528
3529/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3530/// the specified half-open range [Low, Hi).
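/// For example, isUndefOrInRange(-1, 0, 4) and isUndefOrInRange(2, 0, 4) both
/// return true, while isUndefOrInRange(4, 0, 4) returns false.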
3531static bool isUndefOrInRange(int Val, int Low, int Hi) {
3532  return (Val < 0) || (Val >= Low && Val < Hi);
3533}
3534
3535/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3536/// specified value.
3537static bool isUndefOrEqual(int Val, int CmpVal) {
3538  return (Val < 0 || Val == CmpVal);
3539}
3540
3541/// isSequentialOrUndefInRange - Return true if the elements of Mask in positions
3542/// [Pos, Pos+Size) are either undef or form the sequential run Low, Low+1, ...,
3543/// Low+Size-1.
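/// For example, isSequentialOrUndefInRange(<0, 1, -1, 3>, 0, 4, 0) returns true
/// because the defined elements match 0, 1, 2, 3 and element 2 is undef.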
3544static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3545                                       unsigned Pos, unsigned Size, int Low) {
3546  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3547    if (!isUndefOrEqual(Mask[i], Low))
3548      return false;
3549  return true;
3550}
3551
3552/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3553/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
3554/// the second operand.
3555static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
3556  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
3557    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
3558  if (VT == MVT::v2f64 || VT == MVT::v2i64)
3559    return (Mask[0] < 2 && Mask[1] < 2);
3560  return false;
3561}
3562
3563/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3564/// is suitable for input to PSHUFHW.
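/// For example, a v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4> qualifies: the low
/// quadword is copied in order and every high-quadword index stays in [4, 8).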
3565static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3566  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3567    return false;
3568
3569  // Lower quadword copied in order or undef.
3570  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3571    return false;
3572
3573  // Upper quadword shuffled.
3574  for (unsigned i = 4; i != 8; ++i)
3575    if (!isUndefOrInRange(Mask[i], 4, 8))
3576      return false;
3577
3578  if (VT == MVT::v16i16) {
3579    // Lower quadword copied in order or undef.
3580    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3581      return false;
3582
3583    // Upper quadword shuffled.
3584    for (unsigned i = 12; i != 16; ++i)
3585      if (!isUndefOrInRange(Mask[i], 12, 16))
3586        return false;
3587  }
3588
3589  return true;
3590}
3591
3592/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3593/// is suitable for input to PSHUFLW.
3594static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3595  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3596    return false;
3597
3598  // Upper quadword copied in order.
3599  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3600    return false;
3601
3602  // Lower quadword shuffled.
3603  for (unsigned i = 0; i != 4; ++i)
3604    if (!isUndefOrInRange(Mask[i], 0, 4))
3605      return false;
3606
3607  if (VT == MVT::v16i16) {
3608    // Upper quadword copied in order.
3609    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
3610      return false;
3611
3612    // Lower quadword shuffled.
3613    for (unsigned i = 8; i != 12; ++i)
3614      if (!isUndefOrInRange(Mask[i], 8, 12))
3615        return false;
3616  }
3617
3618  return true;
3619}
3620
3621/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
3622/// is suitable for input to PALIGNR.
3623static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
3624                          const X86Subtarget *Subtarget) {
3625  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
3626      (VT.is256BitVector() && !Subtarget->hasInt256()))
3627    return false;
3628
3629  unsigned NumElts = VT.getVectorNumElements();
3630  unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
3631  unsigned NumLaneElts = NumElts/NumLanes;
3632
3633  // Do not handle 64-bit element shuffles with palignr.
3634  if (NumLaneElts == 2)
3635    return false;
3636
3637  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
3638    unsigned i;
3639    for (i = 0; i != NumLaneElts; ++i) {
3640      if (Mask[i+l] >= 0)
3641        break;
3642    }
3643
3644    // Lane is all undef, go to next lane
3645    if (i == NumLaneElts)
3646      continue;
3647
3648    int Start = Mask[i+l];
3649
3650    // Make sure it's in this lane in one of the sources.
3651    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
3652        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
3653      return false;
3654
3655    // If not lane 0, then we must match lane 0
3656    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
3657      return false;
3658
3659    // Correct second source to be contiguous with first source
3660    if (Start >= (int)NumElts)
3661      Start -= NumElts - NumLaneElts;
3662
3663    // Make sure we're shifting in the right direction.
3664    if (Start <= (int)(i+l))
3665      return false;
3666
3667    Start -= i;
3668
3669    // Check the rest of the elements to see if they are consecutive.
3670    for (++i; i != NumLaneElts; ++i) {
3671      int Idx = Mask[i+l];
3672
3673      // Make sure it's in this lane.
3674      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
3675          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
3676        return false;
3677
3678      // If not lane 0, then we must match lane 0
3679      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
3680        return false;
3681
3682      if (Idx >= (int)NumElts)
3683        Idx -= NumElts - NumLaneElts;
3684
3685      if (!isUndefOrEqual(Idx, Start+i))
3686        return false;
3687
3688    }
3689  }
3690
3691  return true;
3692}
3693
3694/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3695/// the two vector operands have swapped position.
3696static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
3697                                     unsigned NumElems) {
3698  for (unsigned i = 0; i != NumElems; ++i) {
3699    int idx = Mask[i];
3700    if (idx < 0)
3701      continue;
3702    else if (idx < (int)NumElems)
3703      Mask[i] = idx + NumElems;
3704    else
3705      Mask[i] = idx - NumElems;
3706  }
3707}
3708
3709/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3710/// specifies a shuffle of elements that is suitable for input to 128/256-bit
3711/// SHUFPS and SHUFPD. If Commuted is true, the check assumes the sources are in
3712/// the reverse of the order x86 shuffles want.
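/// For example, for v4f32 the mask <0, 1, 4, 5> is a valid SHUFPS mask: the low
/// two result elements come from the first source and the high two come from the
/// second.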
3713static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
3714
3715  unsigned NumElems = VT.getVectorNumElements();
3716  unsigned NumLanes = VT.getSizeInBits()/128;
3717  unsigned NumLaneElems = NumElems/NumLanes;
3718
3719  if (NumLaneElems != 2 && NumLaneElems != 4)
3720    return false;
3721
3722  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
3723  bool symmetricMaskRequired =
3724    (VT.getSizeInBits() >= 256) && (EltSize == 32);
3725
3726  // VSHUFPSY divides the resulting vector into 4 chunks.
3727  // The sources are also split into 4 chunks, and each destination
3728  // chunk must come from a different source chunk.
3729  //
3730  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
3731  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
3732  //
3733  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
3734  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
3735  //
3736  // VSHUFPDY divides the resulting vector into 4 chunks.
3737  // The sources are also split into 4 chunks, and each destination
3738  // chunk must come from a different source chunk.
3739  //
3740  //  SRC1 =>      X3       X2       X1       X0
3741  //  SRC2 =>      Y3       Y2       Y1       Y0
3742  //
3743  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
3744  //
3745  SmallVector<int, 4> MaskVal(NumLaneElems, -1);
3746  unsigned HalfLaneElems = NumLaneElems/2;
3747  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
3748    for (unsigned i = 0; i != NumLaneElems; ++i) {
3749      int Idx = Mask[i+l];
3750      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
3751      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
3752        return false;
3753      // For VSHUFPSY, the mask of the second half must be the same as the
3754      // first but with the appropriate offsets. This works in the same way as
3755      // VPERMILPS works with masks.
3756      if (!symmetricMaskRequired || Idx < 0)
3757        continue;
3758      if (MaskVal[i] < 0) {
3759        MaskVal[i] = Idx - l;
3760        continue;
3761      }
3762      if ((signed)(Idx - l) != MaskVal[i])
3763        return false;
3764    }
3765  }
3766
3767  return true;
3768}
3769
3770/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3771/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3772static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
3773  if (!VT.is128BitVector())
3774    return false;
3775
3776  unsigned NumElems = VT.getVectorNumElements();
3777
3778  if (NumElems != 4)
3779    return false;
3780
3781  // Expect elt0 == 6, elt1 == 7, elt2 == 2, elt3 == 3
3782  return isUndefOrEqual(Mask[0], 6) &&
3783         isUndefOrEqual(Mask[1], 7) &&
3784         isUndefOrEqual(Mask[2], 2) &&
3785         isUndefOrEqual(Mask[3], 3);
3786}
3787
3788/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3789/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3790/// <2, 3, 2, 3>
3791static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
3792  if (!VT.is128BitVector())
3793    return false;
3794
3795  unsigned NumElems = VT.getVectorNumElements();
3796
3797  if (NumElems != 4)
3798    return false;
3799
3800  return isUndefOrEqual(Mask[0], 2) &&
3801         isUndefOrEqual(Mask[1], 3) &&
3802         isUndefOrEqual(Mask[2], 2) &&
3803         isUndefOrEqual(Mask[3], 3);
3804}
3805
3806/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3807/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
3808static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
3809  if (!VT.is128BitVector())
3810    return false;
3811
3812  unsigned NumElems = VT.getVectorNumElements();
3813
3814  if (NumElems != 2 && NumElems != 4)
3815    return false;
3816
3817  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3818    if (!isUndefOrEqual(Mask[i], i + NumElems))
3819      return false;
3820
3821  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
3822    if (!isUndefOrEqual(Mask[i], i))
3823      return false;
3824
3825  return true;
3826}
3827
3828/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3829/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
3830static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
3831  if (!VT.is128BitVector())
3832    return false;
3833
3834  unsigned NumElems = VT.getVectorNumElements();
3835
3836  if (NumElems != 2 && NumElems != 4)
3837    return false;
3838
3839  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3840    if (!isUndefOrEqual(Mask[i], i))
3841      return false;
3842
3843  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3844    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
3845      return false;
3846
3847  return true;
3848}
3849
3850//
3851// Some special combinations that can be optimized.
3852//
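// For example, the even-element interleave mask <0, 8, 2, 10, 4, 12, 6, 14>
// on v8i32/v8f32 is rewritten below as an in-lane shuffle of the second
// operand followed by a single blend, avoiding a cross-operand shuffle; the
// odd-element mask <1, 9, 3, 11, 5, 13, 7, 15> is handled symmetrically by
// shuffling the first operand instead.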
3853static
3854SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
3855                               SelectionDAG &DAG) {
3856  MVT VT = SVOp->getSimpleValueType(0);
3857  SDLoc dl(SVOp);
3858
3859  if (VT != MVT::v8i32 && VT != MVT::v8f32)
3860    return SDValue();
3861
3862  ArrayRef<int> Mask = SVOp->getMask();
3863
3864  // These are the special masks that may be optimized.
3865  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
3866  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
3867  bool MatchEvenMask = true;
3868  bool MatchOddMask  = true;
3869  for (int i=0; i<8; ++i) {
3870    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
3871      MatchEvenMask = false;
3872    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
3873      MatchOddMask = false;
3874  }
3875
3876  if (!MatchEvenMask && !MatchOddMask)
3877    return SDValue();
3878
3879  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
3880
3881  SDValue Op0 = SVOp->getOperand(0);
3882  SDValue Op1 = SVOp->getOperand(1);
3883
3884  if (MatchEvenMask) {
3885    // Shift the second operand right by 32 bits.
3886    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
3887    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
3888  } else {
3889    // Shift the first operand left by 32 bits.
3890    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
3891    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
3892  }
3893  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
3894  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
3895}
3896
3897/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
3898/// specifies a shuffle of elements that is suitable for input to UNPCKL.
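/// For example, the v4i32 unpckl mask is <0, 4, 1, 5>; for v8i32 on AVX the
/// interleave is done per 128-bit lane, giving <0, 8, 1, 9, 4, 12, 5, 13>.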
3899static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
3900                         bool HasInt256, bool V2IsSplat = false) {
3901
3902  assert(VT.getSizeInBits() >= 128 &&
3903         "Unsupported vector type for unpckl");
3904
3905  // AVX defines UNPCK* to operate independently on 128-bit lanes.
3906  unsigned NumLanes;
3907  unsigned NumOf256BitLanes;
3908  unsigned NumElts = VT.getVectorNumElements();
3909  if (VT.is256BitVector()) {
3910    if (NumElts != 4 && NumElts != 8 &&
3911        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3912      return false;
3913    NumLanes = 2;
3914    NumOf256BitLanes = 1;
3915  } else if (VT.is512BitVector()) {
3916    assert(VT.getScalarType().getSizeInBits() >= 32 &&
3917           "Unsupported vector type for unpckl");
3918    NumLanes = 2;
3919    NumOf256BitLanes = 2;
3920  } else {
3921    NumLanes = 1;
3922    NumOf256BitLanes = 1;
3923  }
3924
3925  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
3926  unsigned NumLaneElts = NumEltsInStride/NumLanes;
3927
3928  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
3929    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
3930      for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
3931        int BitI  = Mask[l256*NumEltsInStride+l+i];
3932        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
3933        if (!isUndefOrEqual(BitI, j+l256*NumElts))
3934          return false;
3935        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
3936          return false;
3937        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
3938          return false;
3939      }
3940    }
3941  }
3942  return true;
3943}
3944
3945/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
3946/// specifies a shuffle of elements that is suitable for input to UNPCKH.
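/// For example, the v4i32 unpckh mask is <2, 6, 3, 7>.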
3947static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
3948                         bool HasInt256, bool V2IsSplat = false) {
3949  assert(VT.getSizeInBits() >= 128 &&
3950         "Unsupported vector type for unpckh");
3951
3952  // AVX defines UNPCK* to operate independently on 128-bit lanes.
3953  unsigned NumLanes;
3954  unsigned NumOf256BitLanes;
3955  unsigned NumElts = VT.getVectorNumElements();
3956  if (VT.is256BitVector()) {
3957    if (NumElts != 4 && NumElts != 8 &&
3958        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3959      return false;
3960    NumLanes = 2;
3961    NumOf256BitLanes = 1;
3962  } else if (VT.is512BitVector()) {
3963    assert(VT.getScalarType().getSizeInBits() >= 32 &&
3964           "Unsupported vector type for unpckh");
3965    NumLanes = 2;
3966    NumOf256BitLanes = 2;
3967  } else {
3968    NumLanes = 1;
3969    NumOf256BitLanes = 1;
3970  }
3971
3972  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
3973  unsigned NumLaneElts = NumEltsInStride/NumLanes;
3974
3975  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
3976    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
3977      for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
3978        int BitI  = Mask[l256*NumEltsInStride+l+i];
3979        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
3980        if (!isUndefOrEqual(BitI, j+l256*NumElts))
3981          return false;
3982        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
3983          return false;
3984        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
3985          return false;
3986      }
3987    }
3988  }
3989  return true;
3990}
3991
3992/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
3993/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
3994/// <0, 0, 1, 1>
3995static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3996  unsigned NumElts = VT.getVectorNumElements();
3997  bool Is256BitVec = VT.is256BitVector();
3998
3999  if (VT.is512BitVector())
4000    return false;
4001  assert((VT.is128BitVector() || VT.is256BitVector()) &&
4002         "Unsupported vector type for unpckl");
4003
4004  if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4005      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4006    return false;
4007
4008  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4009  // FIXME: Need a better way to get rid of this, there's no latency difference
4010  // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
4011  // the former later. We should also remove the "_undef" special mask.
4012  if (NumElts == 4 && Is256BitVec)
4013    return false;
4014
4015  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4016  // independently on 128-bit lanes.
4017  unsigned NumLanes = VT.getSizeInBits()/128;
4018  unsigned NumLaneElts = NumElts/NumLanes;
4019
4020  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4021    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4022      int BitI  = Mask[l+i];
4023      int BitI1 = Mask[l+i+1];
4024
4025      if (!isUndefOrEqual(BitI, j))
4026        return false;
4027      if (!isUndefOrEqual(BitI1, j))
4028        return false;
4029    }
4030  }
4031
4032  return true;
4033}
4034
4035/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4036/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4037/// <2, 2, 3, 3>
4038static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4039  unsigned NumElts = VT.getVectorNumElements();
4040
4041  if (VT.is512BitVector())
4042    return false;
4043
4044  assert((VT.is128BitVector() || VT.is256BitVector()) &&
4045         "Unsupported vector type for unpckh");
4046
4047  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4048      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4049    return false;
4050
4051  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4052  // independently on 128-bit lanes.
4053  unsigned NumLanes = VT.getSizeInBits()/128;
4054  unsigned NumLaneElts = NumElts/NumLanes;
4055
4056  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4057    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4058      int BitI  = Mask[l+i];
4059      int BitI1 = Mask[l+i+1];
4060      if (!isUndefOrEqual(BitI, j))
4061        return false;
4062      if (!isUndefOrEqual(BitI1, j))
4063        return false;
4064    }
4065  }
4066  return true;
4067}
4068
4069/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4070/// specifies a shuffle of elements that is suitable for input to MOVSS,
4071/// MOVSD, and MOVD, i.e. setting the lowest element.
4072static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4073  if (VT.getVectorElementType().getSizeInBits() < 32)
4074    return false;
4075  if (!VT.is128BitVector())
4076    return false;
4077
4078  unsigned NumElts = VT.getVectorNumElements();
4079
4080  if (!isUndefOrEqual(Mask[0], NumElts))
4081    return false;
4082
4083  for (unsigned i = 1; i != NumElts; ++i)
4084    if (!isUndefOrEqual(Mask[i], i))
4085      return false;
4086
4087  return true;
4088}
4089
4090/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4091/// as permutations between 128-bit chunks or halves. As an example: this
4092/// shuffle below:
4093///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4094/// The first half comes from the second half of V1 and the second half from
4095/// the second half of V2.
4096static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4097  if (!HasFp256 || !VT.is256BitVector())
4098    return false;
4099
4100  // The shuffle result is divided into half A and half B. In total the two
4101  // sources have 4 halves, namely: C, D, E, F. The final values of A and
4102  // B must come from C, D, E or F.
4103  unsigned HalfSize = VT.getVectorNumElements()/2;
4104  bool MatchA = false, MatchB = false;
4105
4106  // Check if A comes from one of C, D, E, F.
4107  for (unsigned Half = 0; Half != 4; ++Half) {
4108    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4109      MatchA = true;
4110      break;
4111    }
4112  }
4113
4114  // Check if B comes from one of C, D, E, F.
4115  for (unsigned Half = 0; Half != 4; ++Half) {
4116    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4117      MatchB = true;
4118      break;
4119    }
4120  }
4121
4122  return MatchA && MatchB;
4123}
4124
4125/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4126/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
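/// For example (illustrative), for v4i64 with mask <2, 3, 6, 7> the low half
/// selects half 1 (the upper half of V1) and the high half selects half 3
/// (the upper half of V2), so the immediate is 1 | (3 << 4) == 0x31.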
4127static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4128  MVT VT = SVOp->getSimpleValueType(0);
4129
4130  unsigned HalfSize = VT.getVectorNumElements()/2;
4131
4132  unsigned FstHalf = 0, SndHalf = 0;
4133  for (unsigned i = 0; i < HalfSize; ++i) {
4134    if (SVOp->getMaskElt(i) > 0) {
4135      FstHalf = SVOp->getMaskElt(i)/HalfSize;
4136      break;
4137    }
4138  }
4139  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4140    if (SVOp->getMaskElt(i) > 0) {
4141      SndHalf = SVOp->getMaskElt(i)/HalfSize;
4142      break;
4143    }
4144  }
4145
4146  return (FstHalf | (SndHalf << 4));
4147}
4148
4149// Symmetric in-lane mask. Each lane has 4 elements (for imm8).
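// For example (illustrative), for v4i64 the mask <1, 0, 3, 2> encodes each
// element index in two bits, giving Imm8 = 1 | (0 << 2) | (3 << 4) | (2 << 6)
// == 0xB1.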
4150static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4151  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4152  if (EltSize < 32)
4153    return false;
4154
4155  unsigned NumElts = VT.getVectorNumElements();
4156  Imm8 = 0;
4157  if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4158    for (unsigned i = 0; i != NumElts; ++i) {
4159      if (Mask[i] < 0)
4160        continue;
4161      Imm8 |= Mask[i] << (i*2);
4162    }
4163    return true;
4164  }
4165
4166  unsigned LaneSize = 4;
4167  SmallVector<int, 4> MaskVal(LaneSize, -1);
4168
4169  for (unsigned l = 0; l != NumElts; l += LaneSize) {
4170    for (unsigned i = 0; i != LaneSize; ++i) {
4171      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4172        return false;
4173      if (Mask[i+l] < 0)
4174        continue;
4175      if (MaskVal[i] < 0) {
4176        MaskVal[i] = Mask[i+l] - l;
4177        Imm8 |= MaskVal[i] << (i*2);
4178        continue;
4179      }
4180      if (Mask[i+l] != (signed)(MaskVal[i]+l))
4181        return false;
4182    }
4183  }
4184  return true;
4185}
4186
4187/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4188/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4189/// Note that VPERMIL mask matching differs depending on whether the underlying
4190/// type is 32- or 64-bit. With VPERMILPS the high half of the mask must select
4191/// the same in-lane elements as the low half, but from the higher half of the
4192/// source. With VPERMILPD the two lanes can be shuffled independently of each
4193/// other, with the same no-lane-crossing restriction. Also handles PSHUFDY.
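/// For example (illustrative), for v8f32 the mask <1, 0, 3, 2, 5, 4, 7, 6> is
/// accepted because both 128-bit lanes use the same in-lane pattern, whereas
/// <1, 0, 3, 2, 4, 5, 6, 7> is rejected because the lanes disagree.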
4194static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4195  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4196  if (VT.getSizeInBits() < 256 || EltSize < 32)
4197    return false;
4198  bool symmetricMaskRequired = (EltSize == 32);
4199  unsigned NumElts = VT.getVectorNumElements();
4200
4201  unsigned NumLanes = VT.getSizeInBits()/128;
4202  unsigned LaneSize = NumElts/NumLanes;
4203  // 2 or 4 elements in one lane
4204
4205  SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4206  for (unsigned l = 0; l != NumElts; l += LaneSize) {
4207    for (unsigned i = 0; i != LaneSize; ++i) {
4208      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4209        return false;
4210      if (symmetricMaskRequired) {
4211        if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4212          ExpectedMaskVal[i] = Mask[i+l] - l;
4213          continue;
4214        }
4215        if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4216          return false;
4217      }
4218    }
4219  }
4220  return true;
4221}
4222
4223/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
4224/// x86 movss wants. x86 movss requires the lowest element to be the lowest
4225/// element of vector 2 and the other elements to come from vector 1 in order.
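/// For example, for v4i32 the mask <0, 5, 6, 7> is the commuted form of the
/// MOVL pattern <4, 1, 2, 3>.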
4226static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4227                               bool V2IsSplat = false, bool V2IsUndef = false) {
4228  if (!VT.is128BitVector())
4229    return false;
4230
4231  unsigned NumOps = VT.getVectorNumElements();
4232  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4233    return false;
4234
4235  if (!isUndefOrEqual(Mask[0], 0))
4236    return false;
4237
4238  for (unsigned i = 1; i != NumOps; ++i)
4239    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4240          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4241          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4242      return false;
4243
4244  return true;
4245}
4246
4247/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4248/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4249/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4250static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4251                           const X86Subtarget *Subtarget) {
4252  if (!Subtarget->hasSSE3())
4253    return false;
4254
4255  unsigned NumElems = VT.getVectorNumElements();
4256
4257  if ((VT.is128BitVector() && NumElems != 4) ||
4258      (VT.is256BitVector() && NumElems != 8) ||
4259      (VT.is512BitVector() && NumElems != 16))
4260    return false;
4261
4262  // "i+1" is the value the indexed mask element must have
4263  for (unsigned i = 0; i != NumElems; i += 2)
4264    if (!isUndefOrEqual(Mask[i], i+1) ||
4265        !isUndefOrEqual(Mask[i+1], i+1))
4266      return false;
4267
4268  return true;
4269}
4270
4271/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4272/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4273/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4274static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4275                           const X86Subtarget *Subtarget) {
4276  if (!Subtarget->hasSSE3())
4277    return false;
4278
4279  unsigned NumElems = VT.getVectorNumElements();
4280
4281  if ((VT.is128BitVector() && NumElems != 4) ||
4282      (VT.is256BitVector() && NumElems != 8) ||
4283      (VT.is512BitVector() && NumElems != 16))
4284    return false;
4285
4286  // "i" is the value the indexed mask element must have
4287  for (unsigned i = 0; i != NumElems; i += 2)
4288    if (!isUndefOrEqual(Mask[i], i) ||
4289        !isUndefOrEqual(Mask[i+1], i))
4290      return false;
4291
4292  return true;
4293}
4294
4295/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4296/// specifies a shuffle of elements that is suitable for input to 256-bit
4297/// version of MOVDDUP.
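/// The expected mask is <0, 0, 2, 2>, i.e. each 128-bit half duplicates its
/// low 64-bit element.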
4298static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4299  if (!HasFp256 || !VT.is256BitVector())
4300    return false;
4301
4302  unsigned NumElts = VT.getVectorNumElements();
4303  if (NumElts != 4)
4304    return false;
4305
4306  for (unsigned i = 0; i != NumElts/2; ++i)
4307    if (!isUndefOrEqual(Mask[i], 0))
4308      return false;
4309  for (unsigned i = NumElts/2; i != NumElts; ++i)
4310    if (!isUndefOrEqual(Mask[i], NumElts/2))
4311      return false;
4312  return true;
4313}
4314
4315/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4316/// specifies a shuffle of elements that is suitable for input to 128-bit
4317/// version of MOVDDUP.
4318static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4319  if (!VT.is128BitVector())
4320    return false;
4321
4322  unsigned e = VT.getVectorNumElements() / 2;
4323  for (unsigned i = 0; i != e; ++i)
4324    if (!isUndefOrEqual(Mask[i], i))
4325      return false;
4326  for (unsigned i = 0; i != e; ++i)
4327    if (!isUndefOrEqual(Mask[e+i], i))
4328      return false;
4329  return true;
4330}
4331
4332/// isVEXTRACTIndex - Return true if the specified
4333/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4334/// suitable for instructions that extract 128- or 256-bit subvectors
4335static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4336  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4337  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4338    return false;
4339
4340  // The index should be aligned on a vecWidth-bit boundary.
4341  uint64_t Index =
4342    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4343
4344  MVT VT = N->getSimpleValueType(0);
4345  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4346  bool Result = (Index * ElSize) % vecWidth == 0;
4347
4348  return Result;
4349}
4350
4351/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4352/// operand specifies a subvector insert that is suitable for input to
4353/// insertion of 128- or 256-bit subvectors
4354static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4355  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4356  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4357    return false;
4358  // The index should be aligned on a vecWidth-bit boundary.
4359  uint64_t Index =
4360    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4361
4362  MVT VT = N->getSimpleValueType(0);
4363  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4364  bool Result = (Index * ElSize) % vecWidth == 0;
4365
4366  return Result;
4367}
4368
4369bool X86::isVINSERT128Index(SDNode *N) {
4370  return isVINSERTIndex(N, 128);
4371}
4372
4373bool X86::isVINSERT256Index(SDNode *N) {
4374  return isVINSERTIndex(N, 256);
4375}
4376
4377bool X86::isVEXTRACT128Index(SDNode *N) {
4378  return isVEXTRACTIndex(N, 128);
4379}
4380
4381bool X86::isVEXTRACT256Index(SDNode *N) {
4382  return isVEXTRACTIndex(N, 256);
4383}
4384
4385/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4386/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4387/// Handles 128-bit and 256-bit.
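/// For example (illustrative), for v4f32 the reversal mask <3, 2, 1, 0>
/// encodes to 0x1B, and for v2f64 the swap mask <1, 0> encodes to 0x1.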
4388static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4389  MVT VT = N->getSimpleValueType(0);
4390
4391  assert((VT.getSizeInBits() >= 128) &&
4392         "Unsupported vector type for PSHUF/SHUFP");
4393
4394  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4395  // independently on 128-bit lanes.
4396  unsigned NumElts = VT.getVectorNumElements();
4397  unsigned NumLanes = VT.getSizeInBits()/128;
4398  unsigned NumLaneElts = NumElts/NumLanes;
4399
4400  assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4401         "Only supports 2, 4 or 8 elements per lane");
4402
4403  unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4404  unsigned Mask = 0;
4405  for (unsigned i = 0; i != NumElts; ++i) {
4406    int Elt = N->getMaskElt(i);
4407    if (Elt < 0) continue;
4408    Elt &= NumLaneElts - 1;
4409    unsigned ShAmt = (i << Shift) % 8;
4410    Mask |= Elt << ShAmt;
4411  }
4412
4413  return Mask;
4414}
4415
4416/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4417/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
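/// For example, for v8i16 the mask <0, 1, 2, 3, 7, 6, 5, 4> reverses the high
/// four words and encodes to the immediate 0x1B.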
4418static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4419  MVT VT = N->getSimpleValueType(0);
4420
4421  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4422         "Unsupported vector type for PSHUFHW");
4423
4424  unsigned NumElts = VT.getVectorNumElements();
4425
4426  unsigned Mask = 0;
4427  for (unsigned l = 0; l != NumElts; l += 8) {
4428    // 8 elements per lane, but we only care about the last 4.
4429    for (unsigned i = 0; i < 4; ++i) {
4430      int Elt = N->getMaskElt(l+i+4);
4431      if (Elt < 0) continue;
4432      Elt &= 0x3; // only 2-bits.
4433      Mask |= Elt << (i * 2);
4434    }
4435  }
4436
4437  return Mask;
4438}
4439
4440/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4441/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
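/// For example, for v8i16 the mask <1, 0, 3, 2, 4, 5, 6, 7> swaps adjacent
/// low words and encodes to the immediate 0xB1.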
4442static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4443  MVT VT = N->getSimpleValueType(0);
4444
4445  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4446         "Unsupported vector type for PSHUFLW");
4447
4448  unsigned NumElts = VT.getVectorNumElements();
4449
4450  unsigned Mask = 0;
4451  for (unsigned l = 0; l != NumElts; l += 8) {
4452    // 8 elements per lane, but we only care about the first 4.
4453    for (unsigned i = 0; i < 4; ++i) {
4454      int Elt = N->getMaskElt(l+i);
4455      if (Elt < 0) continue;
4456      Elt &= 0x3; // only 2-bits
4457      Mask |= Elt << (i * 2);
4458    }
4459  }
4460
4461  return Mask;
4462}
4463
4464/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
4465/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
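/// For example (illustrative), for v8i16 the rotation mask <3, 4, 5, 6, 7, 8,
/// 9, 10> has its first defined element (3) at index 0, so with 2-byte
/// elements the returned immediate is (3 - 0) * 2 == 6.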
4466static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4467  MVT VT = SVOp->getSimpleValueType(0);
4468  unsigned EltSize = VT.is512BitVector() ? 1 :
4469    VT.getVectorElementType().getSizeInBits() >> 3;
4470
4471  unsigned NumElts = VT.getVectorNumElements();
4472  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4473  unsigned NumLaneElts = NumElts/NumLanes;
4474
4475  int Val = 0;
4476  unsigned i;
4477  for (i = 0; i != NumElts; ++i) {
4478    Val = SVOp->getMaskElt(i);
4479    if (Val >= 0)
4480      break;
4481  }
4482  if (Val >= (int)NumElts)
4483    Val -= NumElts - NumLaneElts;
4484
4485  assert(Val - i > 0 && "PALIGNR imm should be positive");
4486  return (Val - i) * EltSize;
4487}
4488
4489static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4490  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4491  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4492    llvm_unreachable("Illegal extract subvector for VEXTRACT");
4493
4494  uint64_t Index =
4495    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4496
4497  MVT VecVT = N->getOperand(0).getSimpleValueType();
4498  MVT ElVT = VecVT.getVectorElementType();
4499
4500  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4501  return Index / NumElemsPerChunk;
4502}
4503
4504static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4505  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4506  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4507    llvm_unreachable("Illegal insert subvector for VINSERT");
4508
4509  uint64_t Index =
4510    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4511
4512  MVT VecVT = N->getSimpleValueType(0);
4513  MVT ElVT = VecVT.getVectorElementType();
4514
4515  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4516  return Index / NumElemsPerChunk;
4517}
4518
4519/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4520/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4521/// and VEXTRACTI128 instructions.
4522unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4523  return getExtractVEXTRACTImmediate(N, 128);
4524}
4525
4526/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4527/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
4528/// and VEXTRACTI64x4 instructions.
4529unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4530  return getExtractVEXTRACTImmediate(N, 256);
4531}
4532
4533/// getInsertVINSERT128Immediate - Return the appropriate immediate
4534/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
4535/// and VINSERTI128 instructions.
4536unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4537  return getInsertVINSERTImmediate(N, 128);
4538}
4539
4540/// getInsertVINSERT256Immediate - Return the appropriate immediate
4541/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
4542/// and VINSERTI64x4 instructions.
4543unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4544  return getInsertVINSERTImmediate(N, 256);
4545}
4546
4547/// isZeroNode - Returns true if Elt is a constant zero or a floating point
4548/// constant +0.0.
4549bool X86::isZeroNode(SDValue Elt) {
4550  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
4551    return CN->isNullValue();
4552  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
4553    return CFP->getValueAPF().isPosZero();
4554  return false;
4555}
4556
4557/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
4558/// their permute mask.
4559static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
4560                                    SelectionDAG &DAG) {
4561  MVT VT = SVOp->getSimpleValueType(0);
4562  unsigned NumElems = VT.getVectorNumElements();
4563  SmallVector<int, 8> MaskVec;
4564
4565  for (unsigned i = 0; i != NumElems; ++i) {
4566    int Idx = SVOp->getMaskElt(i);
4567    if (Idx >= 0) {
4568      if (Idx < (int)NumElems)
4569        Idx += NumElems;
4570      else
4571        Idx -= NumElems;
4572    }
4573    MaskVec.push_back(Idx);
4574  }
4575  return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
4576                              SVOp->getOperand(0), &MaskVec[0]);
4577}
4578
4579/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
4580/// match movhlps. The lower half elements should come from the upper half of
4581/// V1 (and in order), and the upper half elements should come from the upper
4582/// half of V2 (and in order).
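/// For v4f32 this corresponds to the mask <2, 3, 6, 7>.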
4583static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
4584  if (!VT.is128BitVector())
4585    return false;
4586  if (VT.getVectorNumElements() != 4)
4587    return false;
4588  for (unsigned i = 0, e = 2; i != e; ++i)
4589    if (!isUndefOrEqual(Mask[i], i+2))
4590      return false;
4591  for (unsigned i = 2; i != 4; ++i)
4592    if (!isUndefOrEqual(Mask[i], i+4))
4593      return false;
4594  return true;
4595}
4596
4597/// isScalarLoadToVector - Returns true if the node is a scalar load that
4598/// is promoted to a vector. It also returns the LoadSDNode by reference if
4599/// required.
4600static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
4601  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
4602    return false;
4603  N = N->getOperand(0).getNode();
4604  if (!ISD::isNON_EXTLoad(N))
4605    return false;
4606  if (LD)
4607    *LD = cast<LoadSDNode>(N);
4608  return true;
4609}
4610
4611// Test whether the given value is a vector value which will be legalized
4612// into a constant-pool load.
4613static bool WillBeConstantPoolLoad(SDNode *N) {
4614  if (N->getOpcode() != ISD::BUILD_VECTOR)
4615    return false;
4616
4617  // Check for any non-constant elements.
4618  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
4619    switch (N->getOperand(i).getNode()->getOpcode()) {
4620    case ISD::UNDEF:
4621    case ISD::ConstantFP:
4622    case ISD::Constant:
4623      break;
4624    default:
4625      return false;
4626    }
4627
4628  // Vectors of all-zeros and all-ones are materialized with special
4629  // instructions rather than being loaded.
4630  return !ISD::isBuildVectorAllZeros(N) &&
4631         !ISD::isBuildVectorAllOnes(N);
4632}
4633
4634/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
4635/// match movlp{s|d}. The lower half elements should come from lower half of
4636/// V1 (and in order), and the upper half elements should come from the upper
4637/// half of V2 (and in order). And since V1 will become the source of the
4638/// MOVLP, it must be either a vector load or a scalar load to vector.
4639static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
4640                               ArrayRef<int> Mask, MVT VT) {
4641  if (!VT.is128BitVector())
4642    return false;
4643
4644  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
4645    return false;
4646  // If V2 is a vector load, don't do this transformation. We will try to use
4647  // a load-folding shufps op instead.
4648  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
4649    return false;
4650
4651  unsigned NumElems = VT.getVectorNumElements();
4652
4653  if (NumElems != 2 && NumElems != 4)
4654    return false;
4655  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4656    if (!isUndefOrEqual(Mask[i], i))
4657      return false;
4658  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4659    if (!isUndefOrEqual(Mask[i], i+NumElems))
4660      return false;
4661  return true;
4662}
4663
4664/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
4665/// all the same.
4666static bool isSplatVector(SDNode *N) {
4667  if (N->getOpcode() != ISD::BUILD_VECTOR)
4668    return false;
4669
4670  SDValue SplatValue = N->getOperand(0);
4671  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
4672    if (N->getOperand(i) != SplatValue)
4673      return false;
4674  return true;
4675}
4676
4677/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
4678/// to a zero vector.
4679/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
4680static bool isZeroShuffle(ShuffleVectorSDNode *N) {
4681  SDValue V1 = N->getOperand(0);
4682  SDValue V2 = N->getOperand(1);
4683  unsigned NumElems = N->getValueType(0).getVectorNumElements();
4684  for (unsigned i = 0; i != NumElems; ++i) {
4685    int Idx = N->getMaskElt(i);
4686    if (Idx >= (int)NumElems) {
4687      unsigned Opc = V2.getOpcode();
4688      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
4689        continue;
4690      if (Opc != ISD::BUILD_VECTOR ||
4691          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
4692        return false;
4693    } else if (Idx >= 0) {
4694      unsigned Opc = V1.getOpcode();
4695      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
4696        continue;
4697      if (Opc != ISD::BUILD_VECTOR ||
4698          !X86::isZeroNode(V1.getOperand(Idx)))
4699        return false;
4700    }
4701  }
4702  return true;
4703}
4704
4705/// getZeroVector - Returns a vector of specified type with all zero elements.
4706///
4707static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
4708                             SelectionDAG &DAG, SDLoc dl) {
4709  assert(VT.isVector() && "Expected a vector type");
4710
4711  // Always build SSE zero vectors as <4 x i32> bitcasted
4712  // to their dest type. This ensures they get CSE'd.
4713  SDValue Vec;
4714  if (VT.is128BitVector()) {  // SSE
4715    if (Subtarget->hasSSE2()) {  // SSE2
4716      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4717      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4718    } else { // SSE1
4719      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4720      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
4721    }
4722  } else if (VT.is256BitVector()) { // AVX
4723    if (Subtarget->hasInt256()) { // AVX2
4724      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4725      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4726      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
4727                        array_lengthof(Ops));
4728    } else {
4729      // 256-bit logic and arithmetic instructions in AVX are all
4730      // floating-point, no support for integer ops. Emit fp zeroed vectors.
4731      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4732      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4733      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
4734                        array_lengthof(Ops));
4735    }
4736  } else if (VT.is512BitVector()) { // AVX-512
4737      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4738      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
4739                        Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4740      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
4741  } else
4742    llvm_unreachable("Unexpected vector type");
4743
4744  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4745}
4746
4747/// getOnesVector - Returns a vector of specified type with all bits set.
4748/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4749/// no AVX2 support, use two <4 x i32> inserted in an <8 x i32> appropriately.
4750/// Then bitcast to their original type, ensuring they get CSE'd.
4751static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
4752                             SDLoc dl) {
4753  assert(VT.isVector() && "Expected a vector type");
4754
4755  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
4756  SDValue Vec;
4757  if (VT.is256BitVector()) {
4758    if (HasInt256) { // AVX2
4759      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4760      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
4761                        array_lengthof(Ops));
4762    } else { // AVX
4763      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4764      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4765    }
4766  } else if (VT.is128BitVector()) {
4767    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4768  } else
4769    llvm_unreachable("Unexpected vector type");
4770
4771  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4772}
4773
4774/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
4775/// that point to V2 point to its first element.
4776static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
4777  for (unsigned i = 0; i != NumElems; ++i) {
4778    if (Mask[i] > (int)NumElems) {
4779      Mask[i] = NumElems;
4780    }
4781  }
4782}
4783
4784/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
4785/// operation of the specified width.
4786static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
4787                       SDValue V2) {
4788  unsigned NumElems = VT.getVectorNumElements();
4789  SmallVector<int, 8> Mask;
4790  Mask.push_back(NumElems);
4791  for (unsigned i = 1; i != NumElems; ++i)
4792    Mask.push_back(i);
4793  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4794}
4795
4796/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
4797static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
4798                          SDValue V2) {
4799  unsigned NumElems = VT.getVectorNumElements();
4800  SmallVector<int, 8> Mask;
4801  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4802    Mask.push_back(i);
4803    Mask.push_back(i + NumElems);
4804  }
4805  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4806}
4807
4808/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
4809static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
4810                          SDValue V2) {
4811  unsigned NumElems = VT.getVectorNumElements();
4812  SmallVector<int, 8> Mask;
4813  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4814    Mask.push_back(i + Half);
4815    Mask.push_back(i + NumElems + Half);
4816  }
4817  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4818}
4819
4820// PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
4821// a generic shuffle instruction because the target has no such instructions.
4822// Generate shuffles which repeat i16 and i8 several times until they can be
4823// represented by v4f32 and then be manipulated by target supported shuffles.
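// For example (illustrative), to splat element 5 of a v8i16, one getUnpackh
// step produces <e4, e4, e5, e5, e6, e6, e7, e7> and the index becomes 1;
// viewed as v4f32, splatting 32-bit element 1 then replicates the original
// i16 element 5 across the whole vector.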
4824static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
4825  MVT VT = V.getSimpleValueType();
4826  int NumElems = VT.getVectorNumElements();
4827  SDLoc dl(V);
4828
4829  while (NumElems > 4) {
4830    if (EltNo < NumElems/2) {
4831      V = getUnpackl(DAG, dl, VT, V, V);
4832    } else {
4833      V = getUnpackh(DAG, dl, VT, V, V);
4834      EltNo -= NumElems/2;
4835    }
4836    NumElems >>= 1;
4837  }
4838  return V;
4839}
4840
4841/// getLegalSplat - Generate a legal splat with supported x86 shuffles
4842static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
4843  MVT VT = V.getSimpleValueType();
4844  SDLoc dl(V);
4845
4846  if (VT.is128BitVector()) {
4847    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
4848    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
4849    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
4850                             &SplatMask[0]);
4851  } else if (VT.is256BitVector()) {
4852    // To use VPERMILPS to splat scalars, the second half of indices must
4853    // refer to the higher part, which is a duplication of the lower one,
4854    // because VPERMILPS can only handle in-lane permutations.
4855    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
4856                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
4857
4858    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
4859    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
4860                             &SplatMask[0]);
4861  } else
4862    llvm_unreachable("Vector size not supported");
4863
4864  return DAG.getNode(ISD::BITCAST, dl, VT, V);
4865}
4866
4867/// PromoteSplat - Splat is promoted to target supported vector shuffles.
4868static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
4869  MVT SrcVT = SV->getSimpleValueType(0);
4870  SDValue V1 = SV->getOperand(0);
4871  SDLoc dl(SV);
4872
4873  int EltNo = SV->getSplatIndex();
4874  int NumElems = SrcVT.getVectorNumElements();
4875  bool Is256BitVec = SrcVT.is256BitVector();
4876
4877  assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
4878         "Unknown how to promote splat for type");
4879
4880  // Extract the 128-bit part containing the splat element and update
4881  // the splat element index when it refers to the higher register.
4882  if (Is256BitVec) {
4883    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
4884    if (EltNo >= NumElems/2)
4885      EltNo -= NumElems/2;
4886  }
4887
4888  // i16 and i8 vector types can't be used directly by a generic shuffle
4889  // instruction because the target has no such instruction. Generate shuffles
4890  // which repeat i16 and i8 several times until they fit in i32, and then can
4891  // be manipulated by target supported shuffles.
4892  MVT EltVT = SrcVT.getVectorElementType();
4893  if (EltVT == MVT::i8 || EltVT == MVT::i16)
4894    V1 = PromoteSplati8i16(V1, DAG, EltNo);
4895
4896  // Recreate the 256-bit vector and place the same 128-bit vector
4897  // into the low and high part. This is necessary because we want
4898  // to use VPERM* to shuffle the vectors
4899  if (Is256BitVec) {
4900    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
4901  }
4902
4903  return getLegalSplat(DAG, V1, EltNo);
4904}
4905
4906/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
4907/// vector and a zero or undef vector. This produces a shuffle where the low
4908/// element of V2 is swizzled into the zero/undef vector, landing at element
4909/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
4910static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
4911                                           bool IsZero,
4912                                           const X86Subtarget *Subtarget,
4913                                           SelectionDAG &DAG) {
4914  MVT VT = V2.getSimpleValueType();
4915  SDValue V1 = IsZero
4916    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4917  unsigned NumElems = VT.getVectorNumElements();
4918  SmallVector<int, 16> MaskVec;
4919  for (unsigned i = 0; i != NumElems; ++i)
4920    // If this is the insertion idx, put the low elt of V2 here.
4921    MaskVec.push_back(i == Idx ? NumElems : i);
4922  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
4923}
4924
4925/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
4926/// target specific opcode. Returns true if the Mask could be calculated.
4927/// Sets IsUnary to true if it only uses one source.
4928static bool getTargetShuffleMask(SDNode *N, MVT VT,
4929                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
4930  unsigned NumElems = VT.getVectorNumElements();
4931  SDValue ImmN;
4932
4933  IsUnary = false;
4934  switch(N->getOpcode()) {
4935  case X86ISD::SHUFP:
4936    ImmN = N->getOperand(N->getNumOperands()-1);
4937    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4938    break;
4939  case X86ISD::UNPCKH:
4940    DecodeUNPCKHMask(VT, Mask);
4941    break;
4942  case X86ISD::UNPCKL:
4943    DecodeUNPCKLMask(VT, Mask);
4944    break;
4945  case X86ISD::MOVHLPS:
4946    DecodeMOVHLPSMask(NumElems, Mask);
4947    break;
4948  case X86ISD::MOVLHPS:
4949    DecodeMOVLHPSMask(NumElems, Mask);
4950    break;
4951  case X86ISD::PALIGNR:
4952    ImmN = N->getOperand(N->getNumOperands()-1);
4953    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4954    break;
4955  case X86ISD::PSHUFD:
4956  case X86ISD::VPERMILP:
4957    ImmN = N->getOperand(N->getNumOperands()-1);
4958    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4959    IsUnary = true;
4960    break;
4961  case X86ISD::PSHUFHW:
4962    ImmN = N->getOperand(N->getNumOperands()-1);
4963    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4964    IsUnary = true;
4965    break;
4966  case X86ISD::PSHUFLW:
4967    ImmN = N->getOperand(N->getNumOperands()-1);
4968    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4969    IsUnary = true;
4970    break;
4971  case X86ISD::VPERMI:
4972    ImmN = N->getOperand(N->getNumOperands()-1);
4973    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4974    IsUnary = true;
4975    break;
4976  case X86ISD::MOVSS:
4977  case X86ISD::MOVSD: {
4978    // The index 0 always comes from the first element of the second source,
4979    // The index 0 always comes from the first element of the second source;
4980    // this is why MOVSS and MOVSD are used in the first place. The other
4981    // elements come from the other positions of the first source vector.
4982    for (unsigned i = 1; i != NumElems; ++i) {
4983      Mask.push_back(i);
4984    }
4985    break;
4986  }
4987  case X86ISD::VPERM2X128:
4988    ImmN = N->getOperand(N->getNumOperands()-1);
4989    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4990    if (Mask.empty()) return false;
4991    break;
4992  case X86ISD::MOVDDUP:
4993  case X86ISD::MOVLHPD:
4994  case X86ISD::MOVLPD:
4995  case X86ISD::MOVLPS:
4996  case X86ISD::MOVSHDUP:
4997  case X86ISD::MOVSLDUP:
4998    // Not yet implemented
4999    return false;
5000  default: llvm_unreachable("unknown target shuffle node");
5001  }
5002
5003  return true;
5004}
5005
5006/// getShuffleScalarElt - Returns the scalar element that will make up the ith
5007/// element of the result of the vector shuffle.
5008static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5009                                   unsigned Depth) {
5010  if (Depth == 6)
5011    return SDValue();  // Limit search depth.
5012
5013  SDValue V = SDValue(N, 0);
5014  EVT VT = V.getValueType();
5015  unsigned Opcode = V.getOpcode();
5016
5017  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5018  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5019    int Elt = SV->getMaskElt(Index);
5020
5021    if (Elt < 0)
5022      return DAG.getUNDEF(VT.getVectorElementType());
5023
5024    unsigned NumElems = VT.getVectorNumElements();
5025    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5026                                         : SV->getOperand(1);
5027    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5028  }
5029
5030  // Recurse into target specific vector shuffles to find scalars.
5031  if (isTargetShuffle(Opcode)) {
5032    MVT ShufVT = V.getSimpleValueType();
5033    unsigned NumElems = ShufVT.getVectorNumElements();
5034    SmallVector<int, 16> ShuffleMask;
5035    bool IsUnary;
5036
5037    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5038      return SDValue();
5039
5040    int Elt = ShuffleMask[Index];
5041    if (Elt < 0)
5042      return DAG.getUNDEF(ShufVT.getVectorElementType());
5043
5044    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5045                                         : N->getOperand(1);
5046    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5047                               Depth+1);
5048  }
5049
5050  // Actual nodes that may contain scalar elements
5051  if (Opcode == ISD::BITCAST) {
5052    V = V.getOperand(0);
5053    EVT SrcVT = V.getValueType();
5054    unsigned NumElems = VT.getVectorNumElements();
5055
5056    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5057      return SDValue();
5058  }
5059
5060  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5061    return (Index == 0) ? V.getOperand(0)
5062                        : DAG.getUNDEF(VT.getVectorElementType());
5063
5064  if (V.getOpcode() == ISD::BUILD_VECTOR)
5065    return V.getOperand(Index);
5066
5067  return SDValue();
5068}
5069
5070/// getNumOfConsecutiveZeros - Return the number of elements of a vector
5071/// shuffle operation that are consecutively zero. The search can start in
5072/// two different directions, from left or right.
5073/// We count undefs as zeros until PreferredNum is reached.
5074static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5075                                         unsigned NumElems, bool ZerosFromLeft,
5076                                         SelectionDAG &DAG,
5077                                         unsigned PreferredNum = -1U) {
5078  unsigned NumZeros = 0;
5079  for (unsigned i = 0; i != NumElems; ++i) {
5080    unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5081    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5082    if (!Elt.getNode())
5083      break;
5084
5085    if (X86::isZeroNode(Elt))
5086      ++NumZeros;
5087    else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5088      NumZeros = std::min(NumZeros + 1, PreferredNum);
5089    else
5090      break;
5091  }
5092
5093  return NumZeros;
5094}
5095
5096/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
5097/// correspond consecutively to elements from one of the vector operands,
5098/// starting from its index OpIdx. Also sets OpNum to the source operand used.
5099static
5100bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5101                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5102                              unsigned NumElems, unsigned &OpNum) {
5103  bool SeenV1 = false;
5104  bool SeenV2 = false;
5105
5106  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5107    int Idx = SVOp->getMaskElt(i);
5108    // Ignore undef indices
5109    if (Idx < 0)
5110      continue;
5111
5112    if (Idx < (int)NumElems)
5113      SeenV1 = true;
5114    else
5115      SeenV2 = true;
5116
5117    // Only accept consecutive elements from the same vector
5118    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5119      return false;
5120  }
5121
5122  OpNum = SeenV1 ? 0 : 1;
5123  return true;
5124}
5125
5126/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5127/// logical right shift of a vector.
5128static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5129                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5130  unsigned NumElems =
5131    SVOp->getSimpleValueType(0).getVectorNumElements();
5132  unsigned NumZeros = getNumOfConsecutiveZeros(
5133      SVOp, NumElems, false /* check zeros from right */, DAG,
5134      SVOp->getMaskElt(0));
5135  unsigned OpSrc;
5136
5137  if (!NumZeros)
5138    return false;
5139
5140  // Considering the elements in the mask that are not consecutive zeros,
5141  // check if they consecutively come from only one of the source vectors.
5142  //
5143  //               V1 = {X, A, B, C}     0
5144  //                         \  \  \    /
5145  //   vector_shuffle V1, V2 <1, 2, 3, X>
5146  //
5147  if (!isShuffleMaskConsecutive(SVOp,
5148            0,                   // Mask Start Index
5149            NumElems-NumZeros,   // Mask End Index(exclusive)
5150            NumZeros,            // Where to start looking in the src vector
5151            NumElems,            // Number of elements in vector
5152            OpSrc))              // Which source operand ?
5153    return false;
5154
5155  isLeft = false;
5156  ShAmt = NumZeros;
5157  ShVal = SVOp->getOperand(OpSrc);
5158  return true;
5159}
5160
5161/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5162/// logical left shift of a vector.
5163static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5164                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5165  unsigned NumElems =
5166    SVOp->getSimpleValueType(0).getVectorNumElements();
5167  unsigned NumZeros = getNumOfConsecutiveZeros(
5168      SVOp, NumElems, true /* check zeros from left */, DAG,
5169      NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5170  unsigned OpSrc;
5171
5172  if (!NumZeros)
5173    return false;
5174
5175  // Considering the elements in the mask that are not consecutive zeros,
5176  // check if they consecutively come from only one of the source vectors.
5177  //
5178  //                           0    { A, B, X, X } = V2
5179  //                          / \    /  /
5180  //   vector_shuffle V1, V2 <X, X, 4, 5>
5181  //
5182  if (!isShuffleMaskConsecutive(SVOp,
5183            NumZeros,     // Mask Start Index
5184            NumElems,     // Mask End Index(exclusive)
5185            0,            // Where to start looking in the src vector
5186            NumElems,     // Number of elements in vector
5187            OpSrc))       // Which source operand ?
5188    return false;
5189
5190  isLeft = true;
5191  ShAmt = NumZeros;
5192  ShVal = SVOp->getOperand(OpSrc);
5193  return true;
5194}
5195
5196/// isVectorShift - Returns true if the shuffle can be implemented as a
5197/// logical left or right shift of a vector.
5198static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5199                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5200  // Although the logic below supports any bit width, there are no
5201  // shift instructions which handle more than 128-bit vectors.
5202  if (!SVOp->getSimpleValueType(0).is128BitVector())
5203    return false;
5204
5205  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5206      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5207    return true;
5208
5209  return false;
5210}
5211
5212/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5213///
5214static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5215                                       unsigned NumNonZero, unsigned NumZero,
5216                                       SelectionDAG &DAG,
5217                                       const X86Subtarget* Subtarget,
5218                                       const TargetLowering &TLI) {
5219  if (NumNonZero > 8)
5220    return SDValue();
5221
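  // Illustration of the approach below: adjacent byte operands are packed
  // into the 16-bit lanes of a v8i16.  For example, bytes b0 and b1 become
  // the i16 value (zext(b1) << 8) | zext(b0), which is inserted into lane 0;
  // the finished v8i16 is then bitcast back to v16i8.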
5222  SDLoc dl(Op);
5223  SDValue V(0, 0);
5224  bool First = true;
5225  for (unsigned i = 0; i < 16; ++i) {
5226    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5227    if (ThisIsNonZero && First) {
5228      if (NumZero)
5229        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5230      else
5231        V = DAG.getUNDEF(MVT::v8i16);
5232      First = false;
5233    }
5234
5235    if ((i & 1) != 0) {
5236      SDValue ThisElt(0, 0), LastElt(0, 0);
5237      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5238      if (LastIsNonZero) {
5239        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5240                              MVT::i16, Op.getOperand(i-1));
5241      }
5242      if (ThisIsNonZero) {
5243        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5244        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5245                              ThisElt, DAG.getConstant(8, MVT::i8));
5246        if (LastIsNonZero)
5247          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5248      } else
5249        ThisElt = LastElt;
5250
5251      if (ThisElt.getNode())
5252        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5253                        DAG.getIntPtrConstant(i/2));
5254    }
5255  }
5256
5257  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5258}
5259
5260/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5261///
5262static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5263                                     unsigned NumNonZero, unsigned NumZero,
5264                                     SelectionDAG &DAG,
5265                                     const X86Subtarget* Subtarget,
5266                                     const TargetLowering &TLI) {
5267  if (NumNonZero > 4)
5268    return SDValue();
5269
5270  SDLoc dl(Op);
5271  SDValue V(0, 0);
5272  bool First = true;
5273  for (unsigned i = 0; i < 8; ++i) {
5274    bool isNonZero = (NonZeros & (1 << i)) != 0;
5275    if (isNonZero) {
5276      if (First) {
5277        if (NumZero)
5278          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5279        else
5280          V = DAG.getUNDEF(MVT::v8i16);
5281        First = false;
5282      }
5283      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5284                      MVT::v8i16, V, Op.getOperand(i),
5285                      DAG.getIntPtrConstant(i));
5286    }
5287  }
5288
5289  return V;
5290}
5291
5292/// getVShift - Return a vector logical shift node.
5293///
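/// For illustration: the source is bitcast to v2i64, an X86ISD::VSHLDQ (left)
/// or X86ISD::VSRLDQ (right) whole-vector shift node is emitted with NumBits
/// as its amount operand, and the result is bitcast back to VT.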
5294static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5295                         unsigned NumBits, SelectionDAG &DAG,
5296                         const TargetLowering &TLI, SDLoc dl) {
5297  assert(VT.is128BitVector() && "Unknown type for VShift");
5298  EVT ShVT = MVT::v2i64;
5299  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5300  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5301  return DAG.getNode(ISD::BITCAST, dl, VT,
5302                     DAG.getNode(Opc, dl, ShVT, SrcOp,
5303                             DAG.getConstant(NumBits,
5304                                  TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
5305}
5306
5307static SDValue
5308LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5309
5310  // Check if the scalar load can be widened into a vector load. And if
5311  // the address is "base + cst" see if the cst can be "absorbed" into
5312  // the shuffle mask.
5313  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5314    SDValue Ptr = LD->getBasePtr();
5315    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5316      return SDValue();
5317    EVT PVT = LD->getValueType(0);
5318    if (PVT != MVT::i32 && PVT != MVT::f32)
5319      return SDValue();
5320
5321    int FI = -1;
5322    int64_t Offset = 0;
5323    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5324      FI = FINode->getIndex();
5325      Offset = 0;
5326    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5327               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5328      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5329      Offset = Ptr.getConstantOperandVal(1);
5330      Ptr = Ptr.getOperand(0);
5331    } else {
5332      return SDValue();
5333    }
5334
5335    // FIXME: 256-bit vector instructions don't require a strict alignment,
5336    // improve this code to support it better.
5337    unsigned RequiredAlign = VT.getSizeInBits()/8;
5338    SDValue Chain = LD->getChain();
5339    // Make sure the stack object alignment is at least 16 or 32.
5340    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5341    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5342      if (MFI->isFixedObjectIndex(FI)) {
5343        // Can't change the alignment. FIXME: It's possible to compute
5344        // the exact stack offset and reference FI + adjust offset instead.
5345        // If someone *really* cares about this, that's the way to implement it.
5346        return SDValue();
5347      } else {
5348        MFI->setObjectAlignment(FI, RequiredAlign);
5349      }
5350    }
5351
5352    // (Offset % 16 or 32) must be a multiple of 4. The address is then
5353    // Ptr + (Offset & ~(RequiredAlign-1)).
5354    if (Offset < 0)
5355      return SDValue();
5356    if ((Offset % RequiredAlign) & 3)
5357      return SDValue();
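    // Worked example (assuming a 128-bit result, so RequiredAlign == 16): for
    // Offset == 20, StartOffset below becomes 16 and EltNo becomes 1, so the
    // widened load starts at Ptr + 16 and the splat selects element 1.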
5358    int64_t StartOffset = Offset & ~(RequiredAlign-1);
5359    if (StartOffset)
5360      Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5361                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5362
5363    int EltNo = (Offset - StartOffset) >> 2;
5364    unsigned NumElems = VT.getVectorNumElements();
5365
5366    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
5367    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
5368                             LD->getPointerInfo().getWithOffset(StartOffset),
5369                             false, false, false, 0);
5370
5371    SmallVector<int, 8> Mask;
5372    for (unsigned i = 0; i != NumElems; ++i)
5373      Mask.push_back(EltNo);
5374
5375    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
5376  }
5377
5378  return SDValue();
5379}
5380
5381/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
5382/// vector of type 'VT', see if the elements can be replaced by a single large
5383/// load which has the same value as a build_vector whose operands are 'elts'.
5384///
5385/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
5386///
5387/// FIXME: we'd also like to handle the case where the last elements are zero
5388/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
5389/// There's even a handy isZeroNode for that purpose.
5390static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
5391                                        SDLoc &DL, SelectionDAG &DAG) {
5392  EVT EltVT = VT.getVectorElementType();
5393  unsigned NumElems = Elts.size();
5394
5395  LoadSDNode *LDBase = NULL;
5396  unsigned LastLoadedElt = -1U;
5397
5398  // For each element in the initializer, see if we've found a load or an undef.
5399  // If we don't find an initial load element, or later load elements are
5400  // non-consecutive, bail out.
5401  for (unsigned i = 0; i < NumElems; ++i) {
5402    SDValue Elt = Elts[i];
5403
5404    if (!Elt.getNode() ||
5405        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
5406      return SDValue();
5407    if (!LDBase) {
5408      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
5409        return SDValue();
5410      LDBase = cast<LoadSDNode>(Elt.getNode());
5411      LastLoadedElt = i;
5412      continue;
5413    }
5414    if (Elt.getOpcode() == ISD::UNDEF)
5415      continue;
5416
5417    LoadSDNode *LD = cast<LoadSDNode>(Elt);
5418    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
5419      return SDValue();
5420    LastLoadedElt = i;
5421  }
5422
5423  // If we have found an entire vector of loads and undefs, then return a large
5424  // load of the entire vector width starting at the base pointer.  If we found
5425  // consecutive loads for the low half, generate a vzext_load node.
5426  if (LastLoadedElt == NumElems - 1) {
5427    SDValue NewLd = SDValue();
5428    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
5429      NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5430                          LDBase->getPointerInfo(), LDBase->isVolatile(),
5431                          LDBase->isNonTemporal(), LDBase->isInvariant(), 0);
5432    else
5433      NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5434                          LDBase->getPointerInfo(), LDBase->isVolatile(),
5435                          LDBase->isNonTemporal(), LDBase->isInvariant(),
5436                          LDBase->getAlignment());
5437
5438    if (LDBase->hasAnyUseOfValue(1)) {
5439      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5440                                     SDValue(LDBase, 1),
5441                                     SDValue(NewLd.getNode(), 1));
5442      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5443      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5444                             SDValue(NewLd.getNode(), 1));
5445    }
5446
5447    return NewLd;
5448  }
5449  if (NumElems == 4 && LastLoadedElt == 1 &&
5450      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
5451    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
5452    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5453    SDValue ResNode =
5454        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
5455                                array_lengthof(Ops), MVT::i64,
5456                                LDBase->getPointerInfo(),
5457                                LDBase->getAlignment(),
5458                                false/*isVolatile*/, true/*ReadMem*/,
5459                                false/*WriteMem*/);
5460
5461    // Make sure the newly-created LOAD is in the same position as LDBase in
5462    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
5463    // update uses of LDBase's output chain to use the TokenFactor.
5464    if (LDBase->hasAnyUseOfValue(1)) {
5465      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5466                             SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
5467      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5468      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5469                             SDValue(ResNode.getNode(), 1));
5470    }
5471
5472    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
5473  }
5474  return SDValue();
5475}
5476
5477/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
5478/// to generate a splat value for the following cases:
5479/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5480/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5481/// a scalar load, or a constant.
5482/// The VBROADCAST node is returned when a pattern is found,
5483/// or SDValue() otherwise.
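/// For example, a splat BUILD_VECTOR whose operands are all the same scalar
/// load can be emitted as a single X86ISD::VBROADCAST of that load, subject
/// to the subtarget checks below.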
5484static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
5485                                    SelectionDAG &DAG) {
5486  if (!Subtarget->hasFp256())
5487    return SDValue();
5488
5489  MVT VT = Op.getSimpleValueType();
5490  SDLoc dl(Op);
5491
5492  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5493         "Unsupported vector type for broadcast.");
5494
5495  SDValue Ld;
5496  bool ConstSplatVal;
5497
5498  switch (Op.getOpcode()) {
5499    default:
5500      // Unknown pattern found.
5501      return SDValue();
5502
5503    case ISD::BUILD_VECTOR: {
5504      // The BUILD_VECTOR node must be a splat.
5505      if (!isSplatVector(Op.getNode()))
5506        return SDValue();
5507
5508      Ld = Op.getOperand(0);
5509      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5510                     Ld.getOpcode() == ISD::ConstantFP);
5511
5512      // The suspected load node has several users. Make sure that all
5513      // of its users are from the BUILD_VECTOR node.
5514      // Constants may have multiple users.
5515      if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
5516        return SDValue();
5517      break;
5518    }
5519
5520    case ISD::VECTOR_SHUFFLE: {
5521      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5522
5523      // Shuffles must have a splat mask where the first element is
5524      // broadcasted.
5525      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
5526        return SDValue();
5527
5528      SDValue Sc = Op.getOperand(0);
5529      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
5530          Sc.getOpcode() != ISD::BUILD_VECTOR) {
5531
5532        if (!Subtarget->hasInt256())
5533          return SDValue();
5534
5535        // Use the register form of the broadcast instruction available on AVX2.
5536        if (VT.getSizeInBits() >= 256)
5537          Sc = Extract128BitVector(Sc, 0, DAG, dl);
5538        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
5539      }
5540
5541      Ld = Sc.getOperand(0);
5542      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5543                       Ld.getOpcode() == ISD::ConstantFP);
5544
5545      // The scalar_to_vector node and the suspected
5546      // load node must have exactly one user.
5547      // Constants may have multiple users.
5548
5549      // AVX-512 has a register version of the broadcast.
5550      bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
5551        Ld.getValueType().getSizeInBits() >= 32;
5552      if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
5553          !hasRegVer))
5554        return SDValue();
5555      break;
5556    }
5557  }
5558
5559  bool IsGE256 = (VT.getSizeInBits() >= 256);
5560
5561  // Handle broadcasting a single constant scalar from the constant pool
5562  // into a vector. On Sandybridge it is still better to load a constant vector
5563  // from the constant pool than to broadcast it from a scalar.
5564  if (ConstSplatVal && Subtarget->hasInt256()) {
5565    EVT CVT = Ld.getValueType();
5566    assert(!CVT.isVector() && "Must not broadcast a vector type");
5567    unsigned ScalarSize = CVT.getSizeInBits();
5568
5569    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
5570      const Constant *C = 0;
5571      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5572        C = CI->getConstantIntValue();
5573      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5574        C = CF->getConstantFPValue();
5575
5576      assert(C && "Invalid constant type");
5577
5578      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5579      SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
5580      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5581      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
5582                       MachinePointerInfo::getConstantPool(),
5583                       false, false, false, Alignment);
5584
5585      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5586    }
5587  }
5588
5589  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
5590  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5591
5592  // Handle AVX2 in-register broadcasts.
5593  if (!IsLoad && Subtarget->hasInt256() &&
5594      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
5595    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5596
5597  // The scalar source must be a normal load.
5598  if (!IsLoad)
5599    return SDValue();
5600
5601  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
5602    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5603
5604  // The integer check is needed for the 64-bit into 128-bit case, so that it
5605  // doesn't match double, since there is no vbroadcastsd xmm.
5606  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
5607    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5608      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5609  }
5610
5611  // Unsupported broadcast.
5612  return SDValue();
5613}
5614
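/// buildFromShuffleMostly - Lower a BUILD_VECTOR whose operands are mostly
/// EXTRACT_VECTOR_ELTs from at most two source vectors of the same type as
/// one vector_shuffle, followed by INSERT_VECTOR_ELTs for the few remaining
/// non-extract operands.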
5615static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
5616  MVT VT = Op.getSimpleValueType();
5617
5618  // Skip if insert_vec_elt is not supported.
5619  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5620  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
5621    return SDValue();
5622
5623  SDLoc DL(Op);
5624  unsigned NumElems = Op.getNumOperands();
5625
5626  SDValue VecIn1;
5627  SDValue VecIn2;
5628  SmallVector<unsigned, 4> InsertIndices;
5629  SmallVector<int, 8> Mask(NumElems, -1);
5630
5631  for (unsigned i = 0; i != NumElems; ++i) {
5632    unsigned Opc = Op.getOperand(i).getOpcode();
5633
5634    if (Opc == ISD::UNDEF)
5635      continue;
5636
5637    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
5638      // Quit if more than one element needs inserting.
5639      if (InsertIndices.size() > 1)
5640        return SDValue();
5641
5642      InsertIndices.push_back(i);
5643      continue;
5644    }
5645
5646    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
5647    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
5648
5649    // Quit if extracted from vector of different type.
5650    if (ExtractedFromVec.getValueType() != VT)
5651      return SDValue();
5652
5653    // Quit if non-constant index.
5654    if (!isa<ConstantSDNode>(ExtIdx))
5655      return SDValue();
5656
5657    if (VecIn1.getNode() == 0)
5658      VecIn1 = ExtractedFromVec;
5659    else if (VecIn1 != ExtractedFromVec) {
5660      if (VecIn2.getNode() == 0)
5661        VecIn2 = ExtractedFromVec;
5662      else if (VecIn2 != ExtractedFromVec)
5663        // Quit if more than 2 vectors to shuffle
5664        return SDValue();
5665    }
5666
5667    unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5668
5669    if (ExtractedFromVec == VecIn1)
5670      Mask[i] = Idx;
5671    else if (ExtractedFromVec == VecIn2)
5672      Mask[i] = Idx + NumElems;
5673  }
5674
5675  if (VecIn1.getNode() == 0)
5676    return SDValue();
5677
5678  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5679  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
5680  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
5681    unsigned Idx = InsertIndices[i];
5682    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
5683                     DAG.getIntPtrConstant(Idx));
5684  }
5685
5686  return NV;
5687}
5688
5689// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
5690SDValue
5691X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
5692
5693  MVT VT = Op.getSimpleValueType();
5694  assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
5695         "Unexpected type in LowerBUILD_VECTORvXi1!");
5696
5697  SDLoc dl(Op);
5698  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5699    SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
5700    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5701                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5702    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
5703                       Ops, VT.getVectorNumElements());
5704  }
5705
5706  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
5707    SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
5708    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5709                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5710    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
5711                       Ops, VT.getVectorNumElements());
5712  }
5713
5714  bool AllConstants = true;
5715  uint64_t Immediate = 0;
5716  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
5717    SDValue In = Op.getOperand(idx);
5718    if (In.getOpcode() == ISD::UNDEF)
5719      continue;
5720    if (!isa<ConstantSDNode>(In)) {
5721      AllConstants = false;
5722      break;
5723    }
5724    if (cast<ConstantSDNode>(In)->getZExtValue())
5725      Immediate |= (1ULL << idx);
5726  }
5727
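  // For example (v8i1), constant operands <1,0,1,1,0,0,0,0> yield an
  // Immediate of 0xD; the mask is materialized as that i16 constant, bitcast
  // to v16i1, and the low 8 elements are extracted as the result.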
5728  if (AllConstants) {
5729    SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
5730      DAG.getConstant(Immediate, MVT::i16));
5731    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
5732                       DAG.getIntPtrConstant(0));
5733  }
5734
5735  // Splat vector (with undefs)
5736  SDValue In = Op.getOperand(0);
5737  for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) {
5738    if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF)
5739      llvm_unreachable("Unsupported predicate operation");
5740  }
5741
5742  SDValue EFLAGS, X86CC;
5743  if (In.getOpcode() == ISD::SETCC) {
5744    SDValue Op0 = In.getOperand(0);
5745    SDValue Op1 = In.getOperand(1);
5746    ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get();
5747    bool isFP = Op1.getValueType().isFloatingPoint();
5748    unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
5749
5750    assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation");
5751
5752    X86CC = DAG.getConstant(X86CCVal, MVT::i8);
5753    EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG);
5754    EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
5755  } else if (In.getOpcode() == X86ISD::SETCC) {
5756    X86CC = In.getOperand(0);
5757    EFLAGS = In.getOperand(1);
5758  } else {
5759    // The algorithm:
5760    //   Bit1 = In & 0x1
5761    //   if (Bit1 != 0)
5762    //     ZF = 0
5763    //   else
5764    //     ZF = 1
5765    //   if (ZF == 0)
5766    //     res = allOnes ### CMOVNE -1, %res
5767    //   else
5768    //     res = allZero
5769    MVT InVT = In.getSimpleValueType();
5770    SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT));
5771    EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG);
5772    X86CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5773  }
5774
5775  if (VT == MVT::v16i1) {
5776    SDValue Cst1 = DAG.getConstant(-1, MVT::i16);
5777    SDValue Cst0 = DAG.getConstant(0, MVT::i16);
5778    SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16,
5779          Cst0, Cst1, X86CC, EFLAGS);
5780    return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
5781  }
5782
5783  if (VT == MVT::v8i1) {
5784    SDValue Cst1 = DAG.getConstant(-1, MVT::i32);
5785    SDValue Cst0 = DAG.getConstant(0, MVT::i32);
5786    SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32,
5787          Cst0, Cst1, X86CC, EFLAGS);
5788    CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp);
5789    return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
5790  }
5791  llvm_unreachable("Unsupported predicate operation");
5792}
5793
5794SDValue
5795X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5796  SDLoc dl(Op);
5797
5798  MVT VT = Op.getSimpleValueType();
5799  MVT ExtVT = VT.getVectorElementType();
5800  unsigned NumElems = Op.getNumOperands();
5801
5802  // Generate vectors for predicate vectors.
5803  if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
5804    return LowerBUILD_VECTORvXi1(Op, DAG);
5805
5806  // Vectors containing all zeros can be matched by pxor and xorps later
5807  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5808    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
5809    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
5810    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
5811      return Op;
5812
5813    return getZeroVector(VT, Subtarget, DAG, dl);
5814  }
5815
5816  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
5817  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
5818  // vpcmpeqd on 256-bit vectors.
5819  if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
5820    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
5821      return Op;
5822
5823    if (!VT.is512BitVector())
5824      return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
5825  }
5826
5827  SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
5828  if (Broadcast.getNode())
5829    return Broadcast;
5830
5831  unsigned EVTBits = ExtVT.getSizeInBits();
5832
5833  unsigned NumZero  = 0;
5834  unsigned NumNonZero = 0;
5835  unsigned NonZeros = 0;
5836  bool IsAllConstants = true;
5837  SmallSet<SDValue, 8> Values;
5838  for (unsigned i = 0; i < NumElems; ++i) {
5839    SDValue Elt = Op.getOperand(i);
5840    if (Elt.getOpcode() == ISD::UNDEF)
5841      continue;
5842    Values.insert(Elt);
5843    if (Elt.getOpcode() != ISD::Constant &&
5844        Elt.getOpcode() != ISD::ConstantFP)
5845      IsAllConstants = false;
5846    if (X86::isZeroNode(Elt))
5847      NumZero++;
5848    else {
5849      NonZeros |= (1 << i);
5850      NumNonZero++;
5851    }
5852  }
5853
5854  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
5855  if (NumNonZero == 0)
5856    return DAG.getUNDEF(VT);
5857
5858  // Special case for single non-zero, non-undef, element.
5859  if (NumNonZero == 1) {
5860    unsigned Idx = countTrailingZeros(NonZeros);
5861    SDValue Item = Op.getOperand(Idx);
5862
5863    // If this is an insertion of an i64 value on x86-32, and if the top bits of
5864    // the value are obviously zero, truncate the value to i32 and do the
5865    // insertion that way.  Only do this if the value is non-constant or if the
5866    // value is a constant being inserted into element 0.  It is cheaper to do
5867    // a constant pool load than it is to do a movd + shuffle.
5868    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
5869        (!IsAllConstants || Idx == 0)) {
5870      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
5871        // Handle SSE only.
5872        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
5873        EVT VecVT = MVT::v4i32;
5874        unsigned VecElts = 4;
5875
5876        // Truncate the value (which may itself be a constant) to i32, and
5877        // convert it to a vector with movd (S2V+shuffle to zero extend).
5878        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
5879        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
5880        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5881
5882        // Now we have our 32-bit value zero extended in the low element of
5883        // a vector.  If Idx != 0, swizzle it into place.
5884        if (Idx != 0) {
5885          SmallVector<int, 4> Mask;
5886          Mask.push_back(Idx);
5887          for (unsigned i = 1; i != VecElts; ++i)
5888            Mask.push_back(i);
5889          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
5890                                      &Mask[0]);
5891        }
5892        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5893      }
5894    }
5895
5896    // If we have a constant or non-constant insertion into the low element of
5897    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
5898    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
5899    // depending on what the source datatype is.
5900    if (Idx == 0) {
5901      if (NumZero == 0)
5902        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5903
5904      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
5905          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
5906        if (VT.is256BitVector() || VT.is512BitVector()) {
5907          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
5908          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
5909                             Item, DAG.getIntPtrConstant(0));
5910        }
5911        assert(VT.is128BitVector() && "Expected an SSE value type!");
5912        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5913        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
5914        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5915      }
5916
5917      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
5918        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
5919        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
5920        if (VT.is256BitVector()) {
5921          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
5922          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
5923        } else {
5924          assert(VT.is128BitVector() && "Expected an SSE value type!");
5925          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5926        }
5927        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5928      }
5929    }
5930
5931    // Is it a vector logical left shift?
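    // For example, (build_vector 0, x) for v2i64 is scalar_to_vector(x)
    // shifted left by half the vector width (64 bits here) via getVShift.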
5932    if (NumElems == 2 && Idx == 1 &&
5933        X86::isZeroNode(Op.getOperand(0)) &&
5934        !X86::isZeroNode(Op.getOperand(1))) {
5935      unsigned NumBits = VT.getSizeInBits();
5936      return getVShift(true, VT,
5937                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5938                                   VT, Op.getOperand(1)),
5939                       NumBits/2, DAG, *this, dl);
5940    }
5941
5942    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
5943      return SDValue();
5944
5945    // Otherwise, if this is a vector with i32 or f32 elements, and the element
5946    // is a non-constant being inserted into an element other than the low one,
5947    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
5948    // movd/movss) to move this into the low element, then shuffle it into
5949    // place.
5950    if (EVTBits == 32) {
5951      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5952
5953      // Turn it into a shuffle of zero and zero-extended scalar to vector.
5954      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
5955      SmallVector<int, 8> MaskVec;
5956      for (unsigned i = 0; i != NumElems; ++i)
5957        MaskVec.push_back(i == Idx ? 0 : 1);
5958      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
5959    }
5960  }
5961
5962  // Splat is obviously ok. Let legalizer expand it to a shuffle.
5963  if (Values.size() == 1) {
5964    if (EVTBits == 32) {
5965      // Instead of a shuffle like this:
5966      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
5967      // Check if it's possible to issue this instead.
5968      // shuffle (vload ptr), undef, <1, 1, 1, 1>
5969      unsigned Idx = countTrailingZeros(NonZeros);
5970      SDValue Item = Op.getOperand(Idx);
5971      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
5972        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
5973    }
5974    return SDValue();
5975  }
5976
5977  // A vector full of immediates; various special cases are already
5978  // handled, so this is best done with a single constant-pool load.
5979  if (IsAllConstants)
5980    return SDValue();
5981
5982  // For AVX-length vectors, build the individual 128-bit pieces and use
5983  // shuffles to put them in place.
5984  if (VT.is256BitVector()) {
5985    SmallVector<SDValue, 32> V;
5986    for (unsigned i = 0; i != NumElems; ++i)
5987      V.push_back(Op.getOperand(i));
5988
5989    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
5990
5991    // Build both the lower and upper subvector.
5992    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
5993    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
5994                                NumElems/2);
5995
5996    // Recreate the wider vector with the lower and upper part.
5997    return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
5998  }
5999
6000  // Let legalizer expand 2-wide build_vectors.
6001  if (EVTBits == 64) {
6002    if (NumNonZero == 1) {
6003      // One half is zero or undef.
6004      unsigned Idx = countTrailingZeros(NonZeros);
6005      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
6006                                 Op.getOperand(Idx));
6007      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
6008    }
6009    return SDValue();
6010  }
6011
6012  // If element VT is < 32 bits, convert it to inserts into a zero vector.
6013  if (EVTBits == 8 && NumElems == 16) {
6014    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
6015                                        Subtarget, *this);
6016    if (V.getNode()) return V;
6017  }
6018
6019  if (EVTBits == 16 && NumElems == 8) {
6020    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
6021                                      Subtarget, *this);
6022    if (V.getNode()) return V;
6023  }
6024
6025  // If element VT is == 32 bits, turn it into a number of shuffles.
6026  SmallVector<SDValue, 8> V(NumElems);
6027  if (NumElems == 4 && NumZero > 0) {
6028    for (unsigned i = 0; i < 4; ++i) {
6029      bool isZero = !(NonZeros & (1 << i));
6030      if (isZero)
6031        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
6032      else
6033        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6034    }
6035
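    // Each 2-bit field of NonZeros describes one pair of elements:
    // 0 = neither is non-zero, 1 = only the even element is non-zero,
    // 2 = only the odd element is non-zero, 3 = both are non-zero.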
6036    for (unsigned i = 0; i < 2; ++i) {
6037      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
6038        default: break;
6039        case 0:
6040          V[i] = V[i*2];  // Must be a zero vector.
6041          break;
6042        case 1:
6043          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
6044          break;
6045        case 2:
6046          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
6047          break;
6048        case 3:
6049          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
6050          break;
6051      }
6052    }
6053
6054    bool Reverse1 = (NonZeros & 0x3) == 2;
6055    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
6056    int MaskVec[] = {
6057      Reverse1 ? 1 : 0,
6058      Reverse1 ? 0 : 1,
6059      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
6060      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
6061    };
6062    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
6063  }
6064
6065  if (Values.size() > 1 && VT.is128BitVector()) {
6066    // Check for a build vector of consecutive loads.
6067    for (unsigned i = 0; i < NumElems; ++i)
6068      V[i] = Op.getOperand(i);
6069
6070    // Check for elements which are consecutive loads.
6071    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
6072    if (LD.getNode())
6073      return LD;
6074
6075    // Check for a build vector from mostly shuffle plus few inserting.
6076    SDValue Sh = buildFromShuffleMostly(Op, DAG);
6077    if (Sh.getNode())
6078      return Sh;
6079
6080    // For SSE 4.1, use insertps to put the high elements into the low element.
6081    if (getSubtarget()->hasSSE41()) {
6082      SDValue Result;
6083      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
6084        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
6085      else
6086        Result = DAG.getUNDEF(VT);
6087
6088      for (unsigned i = 1; i < NumElems; ++i) {
6089        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
6090        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
6091                             Op.getOperand(i), DAG.getIntPtrConstant(i));
6092      }
6093      return Result;
6094    }
6095
6096    // Otherwise, expand into a number of unpckl*, start by extending each of
6097    // our (non-undef) elements to the full vector width with the element in the
6098    // bottom slot of the vector (which generates no code for SSE).
6099    for (unsigned i = 0; i < NumElems; ++i) {
6100      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
6101        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6102      else
6103        V[i] = DAG.getUNDEF(VT);
6104    }
6105
6106    // Next, we iteratively mix elements, e.g. for v4f32:
6107    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
6108    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
6109    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
6110    unsigned EltStride = NumElems >> 1;
6111    while (EltStride != 0) {
6112      for (unsigned i = 0; i < EltStride; ++i) {
6113        // If V[i+EltStride] is undef and this is the first round of mixing,
6114        // then it is safe to just drop this shuffle: V[i] is already in the
6115        // right place, the one element (since it's the first round) being
6116        // inserted as undef can be dropped.  This isn't safe for successive
6117        // rounds because they will permute elements within both vectors.
6118        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
6119            EltStride == NumElems/2)
6120          continue;
6121
6122        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
6123      }
6124      EltStride >>= 1;
6125    }
6126    return V[0];
6127  }
6128  return SDValue();
6129}
6130
6131// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
6132// to create 256-bit vectors from two other 128-bit ones.
6133static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6134  SDLoc dl(Op);
6135  MVT ResVT = Op.getSimpleValueType();
6136
6137  assert((ResVT.is256BitVector() ||
6138          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
6139
6140  SDValue V1 = Op.getOperand(0);
6141  SDValue V2 = Op.getOperand(1);
6142  unsigned NumElems = ResVT.getVectorNumElements();
6143  if (ResVT.is256BitVector())
6144    return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6145
6146  return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6147}
6148
6149static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6150  assert(Op.getNumOperands() == 2);
6151
6152  // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors
6153  // from two other 128-bit ones.
6154  return LowerAVXCONCAT_VECTORS(Op, DAG);
6155}
6156
6157// Try to lower a shuffle node into a simple blend instruction.
6158static SDValue
6159LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
6160                           const X86Subtarget *Subtarget, SelectionDAG &DAG) {
6161  SDValue V1 = SVOp->getOperand(0);
6162  SDValue V2 = SVOp->getOperand(1);
6163  SDLoc dl(SVOp);
6164  MVT VT = SVOp->getSimpleValueType(0);
6165  MVT EltVT = VT.getVectorElementType();
6166  unsigned NumElems = VT.getVectorNumElements();
6167
6168  // There is no blend with immediate in AVX-512.
6169  if (VT.is512BitVector())
6170    return SDValue();
6171
6172  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
6173    return SDValue();
6174  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
6175    return SDValue();
6176
6177  // Check the mask for BLEND and build the value.
6178  unsigned MaskValue = 0;
6179  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
6180  unsigned NumLanes = (NumElems-1)/8 + 1;
6181  unsigned NumElemsInLane = NumElems / NumLanes;
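  // For example, a v4i32 shuffle mask <0, 5, 2, 7> takes elements 1 and 3
  // from V2, so MaskValue ends up as 0b1010.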
6182
6183  // Blend for v16i16 should be symmetric for both lanes.
6184  for (unsigned i = 0; i < NumElemsInLane; ++i) {
6185
6186    int SndLaneEltIdx = (NumLanes == 2) ?
6187      SVOp->getMaskElt(i + NumElemsInLane) : -1;
6188    int EltIdx = SVOp->getMaskElt(i);
6189
6190    if ((EltIdx < 0 || EltIdx == (int)i) &&
6191        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
6192      continue;
6193
6194    if (((unsigned)EltIdx == (i + NumElems)) &&
6195        (SndLaneEltIdx < 0 ||
6196         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
6197      MaskValue |= (1<<i);
6198    else
6199      return SDValue();
6200  }
6201
6202  // Convert i32 vectors to floating point if AVX2 is not available.
6203  // AVX2 introduced the VPBLENDD instruction for 128 and 256-bit vectors.
6204  MVT BlendVT = VT;
6205  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
6206    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
6207                               NumElems);
6208    V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
6209    V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
6210  }
6211
6212  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
6213                            DAG.getConstant(MaskValue, MVT::i32));
6214  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
6215}
6216
6217// v8i16 shuffles - Prefer shuffles in the following order:
6218// 1. [all]   pshuflw, pshufhw, optional move
6219// 2. [ssse3] 1 x pshufb
6220// 3. [ssse3] 2 x pshufb + 1 x por
6221// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
6222static SDValue
6223LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
6224                         SelectionDAG &DAG) {
6225  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6226  SDValue V1 = SVOp->getOperand(0);
6227  SDValue V2 = SVOp->getOperand(1);
6228  SDLoc dl(SVOp);
6229  SmallVector<int, 8> MaskVals;
6230
6231  // Determine if more than 1 of the words in each of the low and high quadwords
6232  // of the result come from the same quadword of one of the two inputs.  Undef
6233  // mask values count as coming from any quadword, for better codegen.
6234  unsigned LoQuad[] = { 0, 0, 0, 0 };
6235  unsigned HiQuad[] = { 0, 0, 0, 0 };
6236  std::bitset<4> InputQuads;
6237  for (unsigned i = 0; i < 8; ++i) {
6238    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
6239    int EltIdx = SVOp->getMaskElt(i);
6240    MaskVals.push_back(EltIdx);
6241    if (EltIdx < 0) {
6242      ++Quad[0];
6243      ++Quad[1];
6244      ++Quad[2];
6245      ++Quad[3];
6246      continue;
6247    }
6248    ++Quad[EltIdx / 4];
6249    InputQuads.set(EltIdx / 4);
6250  }
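  // For example, for the mask <0,1,2,3,8,9,10,11> every low word comes from
  // quadword 0 (V1's low half) and every high word from quadword 2 (V2's low
  // half), so BestLoQuad becomes 0 and BestHiQuad becomes 2.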
6251
6252  int BestLoQuad = -1;
6253  unsigned MaxQuad = 1;
6254  for (unsigned i = 0; i < 4; ++i) {
6255    if (LoQuad[i] > MaxQuad) {
6256      BestLoQuad = i;
6257      MaxQuad = LoQuad[i];
6258    }
6259  }
6260
6261  int BestHiQuad = -1;
6262  MaxQuad = 1;
6263  for (unsigned i = 0; i < 4; ++i) {
6264    if (HiQuad[i] > MaxQuad) {
6265      BestHiQuad = i;
6266      MaxQuad = HiQuad[i];
6267    }
6268  }
6269
6270  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
6271  // of the two input vectors, shuffle them into one input vector so only a
6272  // single pshufb instruction is necessary. If there are more than 2 input
6273  // quads, disable the next transformation since it does not help SSSE3.
6274  bool V1Used = InputQuads[0] || InputQuads[1];
6275  bool V2Used = InputQuads[2] || InputQuads[3];
6276  if (Subtarget->hasSSSE3()) {
6277    if (InputQuads.count() == 2 && V1Used && V2Used) {
6278      BestLoQuad = InputQuads[0] ? 0 : 1;
6279      BestHiQuad = InputQuads[2] ? 2 : 3;
6280    }
6281    if (InputQuads.count() > 2) {
6282      BestLoQuad = -1;
6283      BestHiQuad = -1;
6284    }
6285  }
6286
6287  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
6288  // the shuffle mask.  If a quad is scored as -1, that means that it contains
6289  // words from all 4 input quadwords.
6290  SDValue NewV;
6291  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
6292    int MaskV[] = {
6293      BestLoQuad < 0 ? 0 : BestLoQuad,
6294      BestHiQuad < 0 ? 1 : BestHiQuad
6295    };
6296    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
6297                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
6298                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
6299    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
6300
6301    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
6302    // source words for the shuffle, to aid later transformations.
6303    bool AllWordsInNewV = true;
6304    bool InOrder[2] = { true, true };
6305    for (unsigned i = 0; i != 8; ++i) {
6306      int idx = MaskVals[i];
6307      if (idx != (int)i)
6308        InOrder[i/4] = false;
6309      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
6310        continue;
6311      AllWordsInNewV = false;
6312      break;
6313    }
6314
6315    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
6316    if (AllWordsInNewV) {
6317      for (int i = 0; i != 8; ++i) {
6318        int idx = MaskVals[i];
6319        if (idx < 0)
6320          continue;
6321        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
6322        if ((idx != i) && idx < 4)
6323          pshufhw = false;
6324        if ((idx != i) && idx > 3)
6325          pshuflw = false;
6326      }
6327      V1 = NewV;
6328      V2Used = false;
6329      BestLoQuad = 0;
6330      BestHiQuad = 1;
6331    }
6332
6333    // If we've eliminated the use of V2, and the new mask is a pshuflw or
6334    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
6335    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
6336      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
6337      unsigned TargetMask = 0;
6338      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
6339                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
6340      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
6341      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
6342                             getShufflePSHUFLWImmediate(SVOp);
6343      V1 = NewV.getOperand(0);
6344      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
6345    }
6346  }
6347
6348  // Promote splats to a larger type which usually leads to more efficient code.
6349  // FIXME: Is this true if pshufb is available?
6350  if (SVOp->isSplat())
6351    return PromoteSplat(SVOp, DAG);
6352
6353  // If we have SSSE3, and all words of the result are from 1 input vector,
6354  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
6355  // is present, fall back to case 4.
6356  if (Subtarget->hasSSSE3()) {
6357    SmallVector<SDValue,16> pshufbMask;
6358
6359    // If we have elements from both input vectors, set the high bit of the
6360    // shuffle mask element to zero out elements that come from V2 in the V1
6361    // mask, and elements that come from V1 in the V2 mask, so that the two
6362    // results can be OR'd together.
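    // For example, word mask element 4 selects bytes 8 and 9 of V1; when both
    // inputs are used, a word sourced from V2 (mask element >= 8) gets 0x80
    // here and contributes its bytes through the second pshufb mask below.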
6363    bool TwoInputs = V1Used && V2Used;
6364    for (unsigned i = 0; i != 8; ++i) {
6365      int EltIdx = MaskVals[i] * 2;
6366      int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
6367      int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
6368      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
6369      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
6370    }
6371    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
6372    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
6373                     DAG.getNode(ISD::BUILD_VECTOR, dl,
6374                                 MVT::v16i8, &pshufbMask[0], 16));
6375    if (!TwoInputs)
6376      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
6377
6378    // Calculate the shuffle mask for the second input, shuffle it, and
6379    // OR it with the first shuffled input.
6380    pshufbMask.clear();
6381    for (unsigned i = 0; i != 8; ++i) {
6382      int EltIdx = MaskVals[i] * 2;
6383      int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
6384      int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
6385      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
6386      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
6387    }
6388    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
6389    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
6390                     DAG.getNode(ISD::BUILD_VECTOR, dl,
6391                                 MVT::v16i8, &pshufbMask[0], 16));
6392    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
6393    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
6394  }
6395
6396  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
6397  // and update MaskVals with new element order.
6398  std::bitset<8> InOrder;
6399  if (BestLoQuad >= 0) {
6400    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
6401    for (int i = 0; i != 4; ++i) {
6402      int idx = MaskVals[i];
6403      if (idx < 0) {
6404        InOrder.set(i);
6405      } else if ((idx / 4) == BestLoQuad) {
6406        MaskV[i] = idx & 3;
6407        InOrder.set(i);
6408      }
6409    }
6410    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
6411                                &MaskV[0]);
6412
6413    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
6414      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
6415      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
6416                                  NewV.getOperand(0),
6417                                  getShufflePSHUFLWImmediate(SVOp), DAG);
6418    }
6419  }
6420
6421  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
6422  // and update MaskVals with the new element order.
6423  if (BestHiQuad >= 0) {
6424    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
6425    for (unsigned i = 4; i != 8; ++i) {
6426      int idx = MaskVals[i];
6427      if (idx < 0) {
6428        InOrder.set(i);
6429      } else if ((idx / 4) == BestHiQuad) {
6430        MaskV[i] = (idx & 3) + 4;
6431        InOrder.set(i);
6432      }
6433    }
6434    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
6435                                &MaskV[0]);
6436
6437    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
6438      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
6439      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
6440                                  NewV.getOperand(0),
6441                                  getShufflePSHUFHWImmediate(SVOp), DAG);
6442    }
6443  }
6444
6445  // In case BestHi & BestLo were both -1, which means each quadword has a word
6446  // from each of the four input quadwords, calculate the InOrder bitvector now
6447  // before falling through to the insert/extract cleanup.
6448  if (BestLoQuad == -1 && BestHiQuad == -1) {
6449    NewV = V1;
6450    for (int i = 0; i != 8; ++i)
6451      if (MaskVals[i] < 0 || MaskVals[i] == i)
6452        InOrder.set(i);
6453  }
6454
6455  // The other elements are put in the right place using pextrw and pinsrw.
6456  for (unsigned i = 0; i != 8; ++i) {
6457    if (InOrder[i])
6458      continue;
6459    int EltIdx = MaskVals[i];
6460    if (EltIdx < 0)
6461      continue;
6462    SDValue ExtOp = (EltIdx < 8) ?
6463      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
6464                  DAG.getIntPtrConstant(EltIdx)) :
6465      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
6466                  DAG.getIntPtrConstant(EltIdx - 8));
6467    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
6468                       DAG.getIntPtrConstant(i));
6469  }
6470  return NewV;
6471}
6472
6473// v16i8 shuffles - Prefer shuffles in the following order:
6474// 1. [ssse3] 1 x pshufb
6475// 2. [ssse3] 2 x pshufb + 1 x por
6476// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
6477static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
6478                                        const X86Subtarget* Subtarget,
6479                                        SelectionDAG &DAG) {
6480  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6481  SDValue V1 = SVOp->getOperand(0);
6482  SDValue V2 = SVOp->getOperand(1);
6483  SDLoc dl(SVOp);
6484  ArrayRef<int> MaskVals = SVOp->getMask();
6485
6486  // Promote splats to a larger type which usually leads to more efficient code.
6487  // FIXME: Is this true if pshufb is available?
6488  if (SVOp->isSplat())
6489    return PromoteSplat(SVOp, DAG);
6490
6491  // If we have SSSE3, case 1 is generated when all result bytes come from
6492  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
6493  // present, fall back to case 3.
6494
6495  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
6496  if (Subtarget->hasSSSE3()) {
6497    SmallVector<SDValue,16> pshufbMask;
6498
6499    // If all result elements are from one input vector, then only translate
6500    // undef mask values to 0x80 (zero out result) in the pshufb mask.
6501    //
6502    // Otherwise, we have elements from both input vectors, and must zero out
6503    // elements that come from V2 in the first mask, and V1 in the second mask
6504    // so that we can OR them together.
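    // For example, a mask element of 20 (byte 4 of V2) becomes 0x80 in V1's
    // pshufb mask and 4 in V2's pshufb mask below.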
6505    for (unsigned i = 0; i != 16; ++i) {
6506      int EltIdx = MaskVals[i];
6507      if (EltIdx < 0 || EltIdx >= 16)
6508        EltIdx = 0x80;
6509      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6510    }
6511    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
6512                     DAG.getNode(ISD::BUILD_VECTOR, dl,
6513                                 MVT::v16i8, &pshufbMask[0], 16));
6514
6515    // As PSHUFB will zero elements with negative indices, it's safe to ignore
6516    // the 2nd operand if it's undefined or zero.
6517    if (V2.getOpcode() == ISD::UNDEF ||
6518        ISD::isBuildVectorAllZeros(V2.getNode()))
6519      return V1;
6520
6521    // Calculate the shuffle mask for the second input, shuffle it, and
6522    // OR it with the first shuffled input.
6523    pshufbMask.clear();
6524    for (unsigned i = 0; i != 16; ++i) {
6525      int EltIdx = MaskVals[i];
6526      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
6527      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6528    }
6529    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
6530                     DAG.getNode(ISD::BUILD_VECTOR, dl,
6531                                 MVT::v16i8, &pshufbMask[0], 16));
6532    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
6533  }
6534
6535  // No SSSE3 - Calculate in-place words and then fix all out-of-place words
6536  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
6537  // the 16 different words that comprise the two doublequadword input vectors.
6538  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
6539  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
6540  SDValue NewV = V1;
6541  for (int i = 0; i != 8; ++i) {
6542    int Elt0 = MaskVals[i*2];
6543    int Elt1 = MaskVals[i*2+1];
6544
6545    // This word of the result is all undef, skip it.
6546    if (Elt0 < 0 && Elt1 < 0)
6547      continue;
6548
6549    // This word of the result is already in the correct place, skip it.
6550    if ((Elt0 == i*2) && (Elt1 == i*2+1))
6551      continue;
6552
6553    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
6554    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
6555    SDValue InsElt;
6556
6557    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
6558    // together using a single extract, extract the word and insert it.
6559    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
6560      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
6561                           DAG.getIntPtrConstant(Elt1 / 2));
6562      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
6563                        DAG.getIntPtrConstant(i));
6564      continue;
6565    }
6566
6567    // If Elt1 is defined, extract it from the appropriate source.  If the
6568    // source byte is not also odd, shift the extracted word left 8 bits
6569    // otherwise clear the bottom 8 bits if we need to do an or.
6570    if (Elt1 >= 0) {
6571      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
6572                           DAG.getIntPtrConstant(Elt1 / 2));
6573      if ((Elt1 & 1) == 0)
6574        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
6575                             DAG.getConstant(8,
6576                                  TLI.getShiftAmountTy(InsElt.getValueType())));
6577      else if (Elt0 >= 0)
6578        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
6579                             DAG.getConstant(0xFF00, MVT::i16));
6580    }
6581    // If Elt0 is defined, extract it from the appropriate source.  If the
6582    // source byte is not also even, shift the extracted word right 8 bits. If
6583    // Elt1 was also defined, OR the extracted values together before
6584    // inserting them in the result.
6585    if (Elt0 >= 0) {
6586      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
6587                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
6588      if ((Elt0 & 1) != 0)
6589        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
6590                              DAG.getConstant(8,
6591                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
6592      else if (Elt1 >= 0)
6593        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
6594                             DAG.getConstant(0x00FF, MVT::i16));
6595      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
6596                         : InsElt0;
6597    }
6598    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
6599                       DAG.getIntPtrConstant(i));
6600  }
6601  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
6602}
6603
6604// v32i8 shuffles - Translate to VPSHUFB if possible.
6605static
6606SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
6607                                 const X86Subtarget *Subtarget,
6608                                 SelectionDAG &DAG) {
6609  MVT VT = SVOp->getSimpleValueType(0);
6610  SDValue V1 = SVOp->getOperand(0);
6611  SDValue V2 = SVOp->getOperand(1);
6612  SDLoc dl(SVOp);
6613  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
6614
6615  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6616  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
6617  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
6618
6619  // VPSHUFB may be generated if
6620  // (1) one of the input vectors is undefined or a zeroinitializer.
6621  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
6622  // And (2) the mask indices don't cross a 128-bit lane boundary.
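      // For example, mask element 20 in the high half becomes 20 & 0xf == 4,
      // selecting byte 4 within that 128-bit lane; undef or out-of-range mask
      // elements become 0x80, which makes VPSHUFB write a zero byte.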
6623  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
6624      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
6625    return SDValue();
6626
6627  if (V1IsAllZero && !V2IsAllZero) {
6628    CommuteVectorShuffleMask(MaskVals, 32);
6629    V1 = V2;
6630  }
6631  SmallVector<SDValue, 32> pshufbMask;
6632  for (unsigned i = 0; i != 32; i++) {
6633    int EltIdx = MaskVals[i];
6634    if (EltIdx < 0 || EltIdx >= 32)
6635      EltIdx = 0x80;
6636    else {
6637      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
6638        // Cross-lane indices are not allowed.
6639        return SDValue();
6640      EltIdx &= 0xf;
6641    }
6642    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6643  }
6644  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
6645                      DAG.getNode(ISD::BUILD_VECTOR, dl,
6646                                  MVT::v32i8, &pshufbMask[0], 32));
6647}
6648
6649/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
6650/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
6651/// done when every pair / quad of shuffle mask elements points to elements in
6652/// the right sequence, e.g.
6653/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
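    /// With Scale == 2, the mask above becomes the v4i32 mask <1, 5, 0, 7>.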
6654static
6655SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
6656                                 SelectionDAG &DAG) {
6657  MVT VT = SVOp->getSimpleValueType(0);
6658  SDLoc dl(SVOp);
6659  unsigned NumElems = VT.getVectorNumElements();
6660  MVT NewVT;
6661  unsigned Scale;
6662  switch (VT.SimpleTy) {
6663  default: llvm_unreachable("Unexpected!");
6664  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
6665  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
6666  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
6667  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
6668  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
6669  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
6670  }
6671
6672  SmallVector<int, 8> MaskVec;
6673  for (unsigned i = 0; i != NumElems; i += Scale) {
6674    int StartIdx = -1;
6675    for (unsigned j = 0; j != Scale; ++j) {
6676      int EltIdx = SVOp->getMaskElt(i+j);
6677      if (EltIdx < 0)
6678        continue;
6679      if (StartIdx < 0)
6680        StartIdx = (EltIdx / Scale);
6681      if (EltIdx != (int)(StartIdx*Scale + j))
6682        return SDValue();
6683    }
6684    MaskVec.push_back(StartIdx);
6685  }
6686
6687  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
6688  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
6689  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
6690}
6691
6692/// getVZextMovL - Return a zero-extending vector move low node.
6693///
6694static SDValue getVZextMovL(MVT VT, MVT OpVT,
6695                            SDValue SrcOp, SelectionDAG &DAG,
6696                            const X86Subtarget *Subtarget, SDLoc dl) {
6697  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
6698    LoadSDNode *LD = NULL;
6699    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
6700      LD = dyn_cast<LoadSDNode>(SrcOp);
6701    if (!LD) {
6702      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
6703      // instead.
6704      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
6705      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
6706          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6707          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
6708          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
6709        // PR2108
6710        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
6711        return DAG.getNode(ISD::BITCAST, dl, VT,
6712                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6713                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6714                                                   OpVT,
6715                                                   SrcOp.getOperand(0)
6716                                                          .getOperand(0))));
6717      }
6718    }
6719  }
6720
6721  return DAG.getNode(ISD::BITCAST, dl, VT,
6722                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6723                                 DAG.getNode(ISD::BITCAST, dl,
6724                                             OpVT, SrcOp)));
6725}
6726
6727/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
6728/// which could not be matched by any known target specific shuffle.
6729static SDValue
6730LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6731
6732  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
6733  if (NewOp.getNode())
6734    return NewOp;
6735
6736  MVT VT = SVOp->getSimpleValueType(0);
6737
6738  unsigned NumElems = VT.getVectorNumElements();
6739  unsigned NumLaneElems = NumElems / 2;
6740
6741  SDLoc dl(SVOp);
6742  MVT EltVT = VT.getVectorElementType();
6743  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
6744  SDValue Output[2];
6745
6746  SmallVector<int, 16> Mask;
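      // Handle the two 128-bit lanes of the result independently. Each output
      // lane is built from at most two 128-bit halves of the inputs; if more
      // halves are needed, fall back to a BUILD_VECTOR of individual elements.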
6747  for (unsigned l = 0; l < 2; ++l) {
6748    // Build a shuffle mask for the output, discovering on the fly which
6749    // input vectors to use as shuffle operands (recorded in InputUsed).
6750    // If building a suitable shuffle vector proves too hard, then bail
6751    // out with UseBuildVector set.
6752    bool UseBuildVector = false;
6753    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
6754    unsigned LaneStart = l * NumLaneElems;
6755    for (unsigned i = 0; i != NumLaneElems; ++i) {
6756      // The mask element.  This indexes into the input.
6757      int Idx = SVOp->getMaskElt(i+LaneStart);
6758      if (Idx < 0) {
6759        // the mask element does not index into any input vector.
6760        Mask.push_back(-1);
6761        continue;
6762      }
6763
6764      // The input vector this mask element indexes into.
6765      int Input = Idx / NumLaneElems;
6766
6767      // Turn the index into an offset from the start of the input vector.
6768      Idx -= Input * NumLaneElems;
6769
6770      // Find or create a shuffle vector operand to hold this input.
6771      unsigned OpNo;
6772      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
6773        if (InputUsed[OpNo] == Input)
6774          // This input vector is already an operand.
6775          break;
6776        if (InputUsed[OpNo] < 0) {
6777          // Create a new operand for this input vector.
6778          InputUsed[OpNo] = Input;
6779          break;
6780        }
6781      }
6782
6783      if (OpNo >= array_lengthof(InputUsed)) {
6784        // More than two input vectors used!  Give up on trying to create a
6785        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
6786        UseBuildVector = true;
6787        break;
6788      }
6789
6790      // Add the mask index for the new shuffle vector.
6791      Mask.push_back(Idx + OpNo * NumLaneElems);
6792    }
6793
6794    if (UseBuildVector) {
6795      SmallVector<SDValue, 16> SVOps;
6796      for (unsigned i = 0; i != NumLaneElems; ++i) {
6797        // The mask element.  This indexes into the input.
6798        int Idx = SVOp->getMaskElt(i+LaneStart);
6799        if (Idx < 0) {
6800          SVOps.push_back(DAG.getUNDEF(EltVT));
6801          continue;
6802        }
6803
6804        // The input vector this mask element indexes into.
6805        int Input = Idx / NumElems;
6806
6807        // Turn the index into an offset from the start of the input vector.
6808        Idx -= Input * NumElems;
6809
6810        // Extract the vector element by hand.
6811        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
6812                                    SVOp->getOperand(Input),
6813                                    DAG.getIntPtrConstant(Idx)));
6814      }
6815
6816      // Construct the output using a BUILD_VECTOR.
6817      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
6818                              SVOps.size());
6819    } else if (InputUsed[0] < 0) {
6820      // No input vectors were used! The result is undefined.
6821      Output[l] = DAG.getUNDEF(NVT);
6822    } else {
6823      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
6824                                        (InputUsed[0] % 2) * NumLaneElems,
6825                                        DAG, dl);
6826      // If only one input was used, use an undefined vector for the other.
6827      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
6828        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
6829                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
6830      // At least one input vector was used. Create a new shuffle vector.
6831      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
6832    }
6833
6834    Mask.clear();
6835  }
6836
6837  // Concatenate the result back
6838  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
6839}
6840
6841/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
6842/// 4 elements, and match them with several different shuffle types.
6843static SDValue
6844LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6845  SDValue V1 = SVOp->getOperand(0);
6846  SDValue V2 = SVOp->getOperand(1);
6847  SDLoc dl(SVOp);
6848  MVT VT = SVOp->getSimpleValueType(0);
6849
6850  assert(VT.is128BitVector() && "Unsupported vector size");
6851
6852  std::pair<int, int> Locs[4];
6853  int Mask1[] = { -1, -1, -1, -1 };
6854  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
6855
6856  unsigned NumHi = 0;
6857  unsigned NumLo = 0;
6858  for (unsigned i = 0; i != 4; ++i) {
6859    int Idx = PermMask[i];
6860    if (Idx < 0) {
6861      Locs[i] = std::make_pair(-1, -1);
6862    } else {
6863      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
6864      if (Idx < 4) {
6865        Locs[i] = std::make_pair(0, NumLo);
6866        Mask1[NumLo] = Idx;
6867        NumLo++;
6868      } else {
6869        Locs[i] = std::make_pair(1, NumHi);
6870        if (2+NumHi < 4)
6871          Mask1[2+NumHi] = Idx;
6872        NumHi++;
6873      }
6874    }
6875  }
6876
6877  if (NumLo <= 2 && NumHi <= 2) {
6878    // No more than two elements come from either vector. This can be
6879    // implemented with two shuffles. The first shuffle gathers the elements.
6880    // The second shuffle, which takes the first shuffle as both of its
6881    // vector operands, puts the elements into the right order.
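        // For example, for the mask <0, 4, 1, 5>: Mask1 = <0, 1, 4, 5> gathers
        // <a0, a1, b0, b1>, and Mask2 = <0, 2, 5, 7> below reorders that result
        // into the desired <a0, b0, a1, b1>.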
6882    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6883
6884    int Mask2[] = { -1, -1, -1, -1 };
6885
6886    for (unsigned i = 0; i != 4; ++i)
6887      if (Locs[i].first != -1) {
6888        unsigned Idx = (i < 2) ? 0 : 4;
6889        Idx += Locs[i].first * 2 + Locs[i].second;
6890        Mask2[i] = Idx;
6891      }
6892
6893    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
6894  }
6895
6896  if (NumLo == 3 || NumHi == 3) {
6897    // Otherwise, we must have three elements from one vector, call it X, and
6898    // one element from the other, call it Y.  First, use a shufps to build an
6899    // intermediate vector with the one element from Y and the element from X
6900    // that will be in the same half in the final destination (the indexes don't
6901    // matter). Then, use a shufps to build the final vector, taking the half
6902    // containing the element from Y from the intermediate, and the other half
6903    // from X.
6904    if (NumHi == 3) {
6905      // Normalize it so the 3 elements come from V1.
6906      CommuteVectorShuffleMask(PermMask, 4);
6907      std::swap(V1, V2);
6908    }
6909
6910    // Find the element from V2.
6911    unsigned HiIndex;
6912    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
6913      int Val = PermMask[HiIndex];
6914      if (Val < 0)
6915        continue;
6916      if (Val >= 4)
6917        break;
6918    }
6919
6920    Mask1[0] = PermMask[HiIndex];
6921    Mask1[1] = -1;
6922    Mask1[2] = PermMask[HiIndex^1];
6923    Mask1[3] = -1;
6924    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6925
6926    if (HiIndex >= 2) {
6927      Mask1[0] = PermMask[0];
6928      Mask1[1] = PermMask[1];
6929      Mask1[2] = HiIndex & 1 ? 6 : 4;
6930      Mask1[3] = HiIndex & 1 ? 4 : 6;
6931      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6932    }
6933
6934    Mask1[0] = HiIndex & 1 ? 2 : 0;
6935    Mask1[1] = HiIndex & 1 ? 0 : 2;
6936    Mask1[2] = PermMask[2];
6937    Mask1[3] = PermMask[3];
6938    if (Mask1[2] >= 0)
6939      Mask1[2] += 4;
6940    if (Mask1[3] >= 0)
6941      Mask1[3] += 4;
6942    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
6943  }
6944
6945  // Break it into (shuffle shuffle_hi, shuffle_lo).
6946  int LoMask[] = { -1, -1, -1, -1 };
6947  int HiMask[] = { -1, -1, -1, -1 };
6948
6949  int *MaskPtr = LoMask;
6950  unsigned MaskIdx = 0;
6951  unsigned LoIdx = 0;
6952  unsigned HiIdx = 2;
6953  for (unsigned i = 0; i != 4; ++i) {
6954    if (i == 2) {
6955      MaskPtr = HiMask;
6956      MaskIdx = 1;
6957      LoIdx = 0;
6958      HiIdx = 2;
6959    }
6960    int Idx = PermMask[i];
6961    if (Idx < 0) {
6962      Locs[i] = std::make_pair(-1, -1);
6963    } else if (Idx < 4) {
6964      Locs[i] = std::make_pair(MaskIdx, LoIdx);
6965      MaskPtr[LoIdx] = Idx;
6966      LoIdx++;
6967    } else {
6968      Locs[i] = std::make_pair(MaskIdx, HiIdx);
6969      MaskPtr[HiIdx] = Idx;
6970      HiIdx++;
6971    }
6972  }
6973
6974  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
6975  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
6976  int MaskOps[] = { -1, -1, -1, -1 };
6977  for (unsigned i = 0; i != 4; ++i)
6978    if (Locs[i].first != -1)
6979      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
6980  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
6981}
6982
6983static bool MayFoldVectorLoad(SDValue V) {
6984  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6985    V = V.getOperand(0);
6986
6987  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6988    V = V.getOperand(0);
6989  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
6990      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
6991    // BUILD_VECTOR (load), undef
6992    V = V.getOperand(0);
6993
6994  return MayFoldLoad(V);
6995}
6996
6997static
6998SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
6999  MVT VT = Op.getSimpleValueType();
7000
7001  // Canonicalize to v2f64.
7002  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
7003  return DAG.getNode(ISD::BITCAST, dl, VT,
7004                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
7005                                          V1, DAG));
7006}
7007
7008static
7009SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
7010                        bool HasSSE2) {
7011  SDValue V1 = Op.getOperand(0);
7012  SDValue V2 = Op.getOperand(1);
7013  MVT VT = Op.getSimpleValueType();
7014
7015  assert(VT != MVT::v2i64 && "unsupported shuffle type");
7016
7017  if (HasSSE2 && VT == MVT::v2f64)
7018    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
7019
7020  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
7021  return DAG.getNode(ISD::BITCAST, dl, VT,
7022                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
7023                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
7024                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
7025}
7026
7027static
7028SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
7029  SDValue V1 = Op.getOperand(0);
7030  SDValue V2 = Op.getOperand(1);
7031  MVT VT = Op.getSimpleValueType();
7032
7033  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
7034         "unsupported shuffle type");
7035
7036  if (V2.getOpcode() == ISD::UNDEF)
7037    V2 = V1;
7038
7039  // v4i32 or v4f32
7040  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
7041}
7042
7043static
7044SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
7045  SDValue V1 = Op.getOperand(0);
7046  SDValue V2 = Op.getOperand(1);
7047  MVT VT = Op.getSimpleValueType();
7048  unsigned NumElems = VT.getVectorNumElements();
7049
7050  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
7051  // operand of these instructions is only memory, so check if there's a
7052  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
7053  // same masks.
7054  bool CanFoldLoad = false;
7055
7056  // Trivial case, when V2 comes from a load.
7057  if (MayFoldVectorLoad(V2))
7058    CanFoldLoad = true;
7059
7060  // When V1 is a load, it can be folded later into a store in isel, example:
7061  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
7062  //    turns into:
7063  //  (MOVLPSmr addr:$src1, VR128:$src2)
7064  // So, recognize this potential and also use MOVLPS or MOVLPD
7065  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
7066    CanFoldLoad = true;
7067
7068  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7069  if (CanFoldLoad) {
7070    if (HasSSE2 && NumElems == 2)
7071      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
7072
7073    if (NumElems == 4)
7074      // If we don't care about the second element, proceed to use movss.
7075      if (SVOp->getMaskElt(1) != -1)
7076        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
7077  }
7078
7079  // movl and movlp will both match v2i64, but v2i64 is never matched by
7080  // movl earlier because we make it strict to avoid messing with the movlp load
7081  // folding logic (see the code above getMOVLP call). Match it here then,
7082  // this is horrible, but will stay like this until we move all shuffle
7083  // matching to x86 specific nodes. Note that for the 1st condition all
7084  // types are matched with movsd.
7085  if (HasSSE2) {
7086    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
7087    // so as to remove this logic from here, as much as possible
7088    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
7089      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
7090    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
7091  }
7092
7093  assert(VT != MVT::v4i32 && "unsupported shuffle type");
7094
7095  // Invert the operand order and use SHUFPS to match it.
7096  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
7097                              getShuffleSHUFImmediate(SVOp), DAG);
7098}
7099
7100// Reduce a vector shuffle to zext.
7101static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
7102                                    SelectionDAG &DAG) {
7103  // PMOVZX is only available from SSE41.
7104  if (!Subtarget->hasSSE41())
7105    return SDValue();
7106
7107  MVT VT = Op.getSimpleValueType();
7108
7109  // Only AVX2 supports 256-bit vector integer extension.
7110  if (!Subtarget->hasInt256() && VT.is256BitVector())
7111    return SDValue();
7112
7113  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7114  SDLoc DL(Op);
7115  SDValue V1 = Op.getOperand(0);
7116  SDValue V2 = Op.getOperand(1);
7117  unsigned NumElems = VT.getVectorNumElements();
7118
7119  // Extension is a unary operation, so V2 must be undef, and the element type
7120  // of the source vector must be smaller than i64.
7121  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
7122      VT.getVectorElementType() == MVT::i64)
7123    return SDValue();
7124
7125  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
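      // For example, the v8i16 mask <0, -1, 1, -1, 2, -1, 3, -1> gives Shift == 1
      // and is lowered as a zero extension of the low four i16 elements to v4i32.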
7126  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
7127  while ((1U << Shift) < NumElems) {
7128    if (SVOp->getMaskElt(1U << Shift) == 1)
7129      break;
7130    Shift += 1;
7131    // The maximal ratio is 8, i.e. from i8 to i64.
7132    if (Shift > 3)
7133      return SDValue();
7134  }
7135
7136  // Check the shuffle mask.
7137  unsigned Mask = (1U << Shift) - 1;
7138  for (unsigned i = 0; i != NumElems; ++i) {
7139    int EltIdx = SVOp->getMaskElt(i);
7140    if ((i & Mask) != 0 && EltIdx != -1)
7141      return SDValue();
7142    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
7143      return SDValue();
7144  }
7145
7146  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
7147  MVT NeVT = MVT::getIntegerVT(NBits);
7148  MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
7149
7150  if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
7151    return SDValue();
7152
7153  // Simplify the operand before it is fed into the zero-extending node.
7154  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
7155  if (V1.getOpcode() == ISD::BITCAST &&
7156      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7157      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7158      V1.getOperand(0).getOperand(0)
7159        .getSimpleValueType().getSizeInBits() == SignificantBits) {
7160    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
7161    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
7162    ConstantSDNode *CIdx =
7163      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
7164    // If it's foldable, i.e. a normal load with a single use, we let code
7165    // selection fold it. Otherwise, we shorten the conversion sequence.
7166    if (CIdx && CIdx->getZExtValue() == 0 &&
7167        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
7168      MVT FullVT = V.getSimpleValueType();
7169      MVT V1VT = V1.getSimpleValueType();
7170      if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
7171        // The "ext_vec_elt" node is wider than the result node.
7172        // In this case we should extract subvector from V.
7173        // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
7174        unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
7175        MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
7176                                        FullVT.getVectorNumElements()/Ratio);
7177        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
7178                        DAG.getIntPtrConstant(0));
7179      }
7180      V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
7181    }
7182  }
7183
7184  return DAG.getNode(ISD::BITCAST, DL, VT,
7185                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
7186}
7187
7188static SDValue
7189NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
7190                       SelectionDAG &DAG) {
7191  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7192  MVT VT = Op.getSimpleValueType();
7193  SDLoc dl(Op);
7194  SDValue V1 = Op.getOperand(0);
7195  SDValue V2 = Op.getOperand(1);
7196
7197  if (isZeroShuffle(SVOp))
7198    return getZeroVector(VT, Subtarget, DAG, dl);
7199
7200  // Handle splat operations
7201  if (SVOp->isSplat()) {
7202    // Use vbroadcast whenever the splat comes from a foldable load
7203    SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
7204    if (Broadcast.getNode())
7205      return Broadcast;
7206  }
7207
7208  // Check integer expanding shuffles.
7209  SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
7210  if (NewOp.getNode())
7211    return NewOp;
7212
7213  // If the shuffle can be profitably rewritten as a narrower shuffle, then
7214  // do it!
7215  if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
7216      VT == MVT::v16i16 || VT == MVT::v32i8) {
7217    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
7218    if (NewOp.getNode())
7219      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
7220  } else if ((VT == MVT::v4i32 ||
7221             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
7222    // FIXME: Figure out a cleaner way to do this.
7223    // Try to make use of movq to zero out the top part.
7224    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
7225      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
7226      if (NewOp.getNode()) {
7227        MVT NewVT = NewOp.getSimpleValueType();
7228        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
7229                               NewVT, true, false))
7230          return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
7231                              DAG, Subtarget, dl);
7232      }
7233    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
7234      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
7235      if (NewOp.getNode()) {
7236        MVT NewVT = NewOp.getSimpleValueType();
7237        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
7238          return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
7239                              DAG, Subtarget, dl);
7240      }
7241    }
7242  }
7243  return SDValue();
7244}
7245
7246SDValue
7247X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
7248  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
7249  SDValue V1 = Op.getOperand(0);
7250  SDValue V2 = Op.getOperand(1);
7251  MVT VT = Op.getSimpleValueType();
7252  SDLoc dl(Op);
7253  unsigned NumElems = VT.getVectorNumElements();
7254  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
7255  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
7256  bool V1IsSplat = false;
7257  bool V2IsSplat = false;
7258  bool HasSSE2 = Subtarget->hasSSE2();
7259  bool HasFp256    = Subtarget->hasFp256();
7260  bool HasInt256   = Subtarget->hasInt256();
7261  MachineFunction &MF = DAG.getMachineFunction();
7262  bool OptForSize = MF.getFunction()->getAttributes().
7263    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
7264
7265  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
7266
7267  if (V1IsUndef && V2IsUndef)
7268    return DAG.getUNDEF(VT);
7269
7270  assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
7271
7272  // Vector shuffle lowering takes 3 steps:
7273  //
7274  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
7275  //    narrowing and commutation of operands should be handled.
7276  // 2) Matching of shuffles with known shuffle masks to x86 target specific
7277  //    shuffle nodes.
7278  // 3) Rewriting of unmatched masks into new generic shuffle operations,
7279  //    so the shuffle can be broken into other shuffles and the legalizer can
7280  //    try the lowering again.
7281  //
7282  // The general idea is that no vector_shuffle operation should be left to
7283  // be matched during isel, all of them must be converted to a target specific
7284  // node here.
7285
7286  // Normalize the input vectors. Here splats, zeroed vectors, profitable
7287  // narrowing and commutation of operands should be handled. The actual code
7288  // doesn't include all of those, work in progress...
7289  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
7290  if (NewOp.getNode())
7291    return NewOp;
7292
7293  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
7294
7295  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
7296  // unpckh_undef). Only use pshufd if speed is more important than size.
7297  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
7298    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
7299  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
7300    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
7301
7302  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
7303      V2IsUndef && MayFoldVectorLoad(V1))
7304    return getMOVDDup(Op, dl, V1, DAG);
7305
7306  if (isMOVHLPS_v_undef_Mask(M, VT))
7307    return getMOVHighToLow(Op, dl, DAG);
7308
7309  // Used to match splats
7310  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
7311      (VT == MVT::v2f64 || VT == MVT::v2i64))
7312    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
7313
7314  if (isPSHUFDMask(M, VT)) {
7315    // The actual implementation will match the mask in the if above and then
7316    // during isel it can match several different instructions, not only pshufd
7317    // as its name says, sad but true, emulate the behavior for now...
7318    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
7319      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
7320
7321    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
7322
7323    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
7324      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
7325
7326    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
7327      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
7328                                  DAG);
7329
7330    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
7331                                TargetMask, DAG);
7332  }
7333
7334  if (isPALIGNRMask(M, VT, Subtarget))
7335    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
7336                                getShufflePALIGNRImmediate(SVOp),
7337                                DAG);
7338
7339  // Check if this can be converted into a logical shift.
7340  bool isLeft = false;
7341  unsigned ShAmt = 0;
7342  SDValue ShVal;
7343  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
7344  if (isShift && ShVal.hasOneUse()) {
7345    // If the shifted value has multiple uses, it may be cheaper to use
7346    // v_set0 + movlhps or movhlps, etc.
7347    MVT EltVT = VT.getVectorElementType();
7348    ShAmt *= EltVT.getSizeInBits();
7349    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
7350  }
7351
7352  if (isMOVLMask(M, VT)) {
7353    if (ISD::isBuildVectorAllZeros(V1.getNode()))
7354      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
7355    if (!isMOVLPMask(M, VT)) {
7356      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
7357        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
7358
7359      if (VT == MVT::v4i32 || VT == MVT::v4f32)
7360        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
7361    }
7362  }
7363
7364  // FIXME: fold these into legal mask.
7365  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
7366    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
7367
7368  if (isMOVHLPSMask(M, VT))
7369    return getMOVHighToLow(Op, dl, DAG);
7370
7371  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
7372    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
7373
7374  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
7375    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
7376
7377  if (isMOVLPMask(M, VT))
7378    return getMOVLP(Op, dl, DAG, HasSSE2);
7379
7380  if (ShouldXformToMOVHLPS(M, VT) ||
7381      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
7382    return CommuteVectorShuffle(SVOp, DAG);
7383
7384  if (isShift) {
7385    // No better options. Use a vshldq / vsrldq.
7386    MVT EltVT = VT.getVectorElementType();
7387    ShAmt *= EltVT.getSizeInBits();
7388    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
7389  }
7390
7391  bool Commuted = false;
7392  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
7393  // 1,1,1,1 -> v8i16 though.
7394  V1IsSplat = isSplatVector(V1.getNode());
7395  V2IsSplat = isSplatVector(V2.getNode());
7396
7397  // Canonicalize the splat or undef, if present, to be on the RHS.
7398  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
7399    CommuteVectorShuffleMask(M, NumElems);
7400    std::swap(V1, V2);
7401    std::swap(V1IsSplat, V2IsSplat);
7402    Commuted = true;
7403  }
7404
7405  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
7406    // Shuffling low element of v1 into undef, just return v1.
7407    if (V2IsUndef)
7408      return V1;
7409    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
7410    // the instruction selector will not match, so get a canonical MOVL with
7411    // swapped operands to undo the commute.
7412    return getMOVL(DAG, dl, VT, V2, V1);
7413  }
7414
7415  if (isUNPCKLMask(M, VT, HasInt256))
7416    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
7417
7418  if (isUNPCKHMask(M, VT, HasInt256))
7419    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
7420
7421  if (V2IsSplat) {
7422    // Normalize the mask so all entries that point to V2 point to its first
7423    // element, then try to match unpck{h|l} again. If it matches, return a
7424    // new vector_shuffle with the corrected mask.
7425    SmallVector<int, 8> NewMask(M.begin(), M.end());
7426    NormalizeMask(NewMask, NumElems);
7427    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
7428      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
7429    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
7430      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
7431  }
7432
7433  if (Commuted) {
7434    // Commute it back and try unpck* again.
7435    // FIXME: this seems wrong.
7436    CommuteVectorShuffleMask(M, NumElems);
7437    std::swap(V1, V2);
7438    std::swap(V1IsSplat, V2IsSplat);
7439    Commuted = false;
7440
7441    if (isUNPCKLMask(M, VT, HasInt256))
7442      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
7443
7444    if (isUNPCKHMask(M, VT, HasInt256))
7445      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
7446  }
7447
7448  // Normalize the node to match x86 shuffle ops if needed
7449  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
7450    return CommuteVectorShuffle(SVOp, DAG);
7451
7452  // The checks below are all present in isShuffleMaskLegal, but they are
7453  // inlined here right now to enable us to directly emit target specific
7454  // nodes, and remove one by one until they don't return Op anymore.
7455
7456  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
7457      SVOp->getSplatIndex() == 0 && V2IsUndef) {
7458    if (VT == MVT::v2f64 || VT == MVT::v2i64)
7459      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
7460  }
7461
7462  if (isPSHUFHWMask(M, VT, HasInt256))
7463    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
7464                                getShufflePSHUFHWImmediate(SVOp),
7465                                DAG);
7466
7467  if (isPSHUFLWMask(M, VT, HasInt256))
7468    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
7469                                getShufflePSHUFLWImmediate(SVOp),
7470                                DAG);
7471
7472  if (isSHUFPMask(M, VT))
7473    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
7474                                getShuffleSHUFImmediate(SVOp), DAG);
7475
7476  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
7477    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
7478  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
7479    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
7480
7481  //===--------------------------------------------------------------------===//
7482  // Generate target specific nodes for 128 or 256-bit shuffles only
7483  // supported in the AVX instruction set.
7484  //
7485
7486  // Handle VMOVDDUPY permutations
7487  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
7488    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
7489
7490  // Handle VPERMILPS/D* permutations
7491  if (isVPERMILPMask(M, VT)) {
7492    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
7493      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
7494                                  getShuffleSHUFImmediate(SVOp), DAG);
7495    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
7496                                getShuffleSHUFImmediate(SVOp), DAG);
7497  }
7498
7499  // Handle VPERM2F128/VPERM2I128 permutations
7500  if (isVPERM2X128Mask(M, VT, HasFp256))
7501    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
7502                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
7503
7504  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
7505  if (BlendOp.getNode())
7506    return BlendOp;
7507
7508  unsigned Imm8;
7509  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
7510    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
7511
7512  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
7513      VT.is512BitVector()) {
7514    MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
7515    MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
7516    SmallVector<SDValue, 16> permclMask;
7517    for (unsigned i = 0; i != NumElems; ++i) {
7518      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
7519    }
7520
7521    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
7522                                &permclMask[0], NumElems);
7523    if (V2IsUndef)
7524      // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
7525      return DAG.getNode(X86ISD::VPERMV, dl, VT,
7526                          DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
7527    return DAG.getNode(X86ISD::VPERMV3, dl, VT,
7528                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
7529  }
7530
7531  //===--------------------------------------------------------------------===//
7532  // Since no target specific shuffle was selected for this generic one,
7533  // lower it into other known shuffles. FIXME: this isn't true yet, but
7534  // this is the plan.
7535  //
7536
7537  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
7538  if (VT == MVT::v8i16) {
7539    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
7540    if (NewOp.getNode())
7541      return NewOp;
7542  }
7543
7544  if (VT == MVT::v16i8) {
7545    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
7546    if (NewOp.getNode())
7547      return NewOp;
7548  }
7549
7550  if (VT == MVT::v32i8) {
7551    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
7552    if (NewOp.getNode())
7553      return NewOp;
7554  }
7555
7556  // Handle all 128-bit wide vectors with 4 elements, and match them with
7557  // several different shuffle types.
7558  if (NumElems == 4 && VT.is128BitVector())
7559    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
7560
7561  // Handle general 256-bit shuffles
7562  if (VT.is256BitVector())
7563    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
7564
7565  return SDValue();
7566}
7567
7568static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
7569  MVT VT = Op.getSimpleValueType();
7570  SDLoc dl(Op);
7571
7572  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
7573    return SDValue();
7574
7575  if (VT.getSizeInBits() == 8) {
7576    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
7577                                  Op.getOperand(0), Op.getOperand(1));
7578    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
7579                                  DAG.getValueType(VT));
7580    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7581  }
7582
7583  if (VT.getSizeInBits() == 16) {
7584    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7585    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
7586    if (Idx == 0)
7587      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
7588                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7589                                     DAG.getNode(ISD::BITCAST, dl,
7590                                                 MVT::v4i32,
7591                                                 Op.getOperand(0)),
7592                                     Op.getOperand(1)));
7593    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
7594                                  Op.getOperand(0), Op.getOperand(1));
7595    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
7596                                  DAG.getValueType(VT));
7597    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7598  }
7599
7600  if (VT == MVT::f32) {
7601    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
7602    // the result back to FR32 register. It's only worth matching if the
7603    // result has a single use which is a store or a bitcast to i32.  And in
7604    // the case of a store, it's not worth it if the index is a constant 0,
7605    // because a MOVSSmr can be used instead, which is smaller and faster.
7606    if (!Op.hasOneUse())
7607      return SDValue();
7608    SDNode *User = *Op.getNode()->use_begin();
7609    if ((User->getOpcode() != ISD::STORE ||
7610         (isa<ConstantSDNode>(Op.getOperand(1)) &&
7611          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
7612        (User->getOpcode() != ISD::BITCAST ||
7613         User->getValueType(0) != MVT::i32))
7614      return SDValue();
7615    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7616                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
7617                                              Op.getOperand(0)),
7618                                              Op.getOperand(1));
7619    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
7620  }
7621
7622  if (VT == MVT::i32 || VT == MVT::i64) {
7623    // ExtractPS/pextrq work with a constant index.
7624    if (isa<ConstantSDNode>(Op.getOperand(1)))
7625      return Op;
7626  }
7627  return SDValue();
7628}
7629
7630SDValue
7631X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
7632                                           SelectionDAG &DAG) const {
7633  SDLoc dl(Op);
7634  SDValue Vec = Op.getOperand(0);
7635  MVT VecVT = Vec.getSimpleValueType();
7636  SDValue Idx = Op.getOperand(1);
7637  if (!isa<ConstantSDNode>(Idx)) {
7638    if (VecVT.is512BitVector() ||
7639        (VecVT.is256BitVector() && Subtarget->hasInt256() &&
7640         VecVT.getVectorElementType().getSizeInBits() == 32)) {
7641
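          // Build a mask vector whose lane 0 holds the variable index, permute the
          // source with it so the requested element lands in lane 0, and then
          // extract lane 0 with a constant index.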
7642      MVT MaskEltVT =
7643        MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
7644      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
7645                                    MaskEltVT.getSizeInBits());
7646
7647      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
7648      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
7649                                getZeroVector(MaskVT, Subtarget, DAG, dl),
7650                                Idx, DAG.getConstant(0, getPointerTy()));
7651      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
7652      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
7653                        Perm, DAG.getConstant(0, getPointerTy()));
7654    }
7655    return SDValue();
7656  }
7657
7658  // If this is a 256-bit vector result, first extract the 128-bit vector and
7659  // then extract the element from the 128-bit vector.
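      // For example, extracting element 5 of a v8i32 extracts the upper 128-bit
      // half and then extracts element 5 - 4 == 1 from it.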
7660  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
7661
7662    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7663    // Get the 128-bit vector.
7664    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
7665    MVT EltVT = VecVT.getVectorElementType();
7666
7667    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
7668
7669    //if (IdxVal >= NumElems/2)
7670    //  IdxVal -= NumElems/2;
7671    IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
7672    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
7673                       DAG.getConstant(IdxVal, MVT::i32));
7674  }
7675
7676  assert(VecVT.is128BitVector() && "Unexpected vector length");
7677
7678  if (Subtarget->hasSSE41()) {
7679    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
7680    if (Res.getNode())
7681      return Res;
7682  }
7683
7684  MVT VT = Op.getSimpleValueType();
7685  // TODO: handle v16i8.
7686  if (VT.getSizeInBits() == 16) {
7687    SDValue Vec = Op.getOperand(0);
7688    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7689    if (Idx == 0)
7690      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
7691                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7692                                     DAG.getNode(ISD::BITCAST, dl,
7693                                                 MVT::v4i32, Vec),
7694                                     Op.getOperand(1)));
7695    // Transform it so it matches pextrw, which produces a 32-bit result.
7696    MVT EltVT = MVT::i32;
7697    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
7698                                  Op.getOperand(0), Op.getOperand(1));
7699    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
7700                                  DAG.getValueType(VT));
7701    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7702  }
7703
7704  if (VT.getSizeInBits() == 32) {
7705    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7706    if (Idx == 0)
7707      return Op;
7708
7709    // SHUFPS the element to the lowest double word, then movss.
7710    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
7711    MVT VVT = Op.getOperand(0).getSimpleValueType();
7712    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7713                                       DAG.getUNDEF(VVT), Mask);
7714    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7715                       DAG.getIntPtrConstant(0));
7716  }
7717
7718  if (VT.getSizeInBits() == 64) {
7719    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
7720    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
7721    //        to match extract_elt for f64.
7722    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7723    if (Idx == 0)
7724      return Op;
7725
7726    // UNPCKHPD the element to the lowest double word, then movsd.
7727    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
7728    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
7729    int Mask[2] = { 1, -1 };
7730    MVT VVT = Op.getOperand(0).getSimpleValueType();
7731    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7732                                       DAG.getUNDEF(VVT), Mask);
7733    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7734                       DAG.getIntPtrConstant(0));
7735  }
7736
7737  return SDValue();
7738}
7739
7740static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
7741  MVT VT = Op.getSimpleValueType();
7742  MVT EltVT = VT.getVectorElementType();
7743  SDLoc dl(Op);
7744
7745  SDValue N0 = Op.getOperand(0);
7746  SDValue N1 = Op.getOperand(1);
7747  SDValue N2 = Op.getOperand(2);
7748
7749  if (!VT.is128BitVector())
7750    return SDValue();
7751
7752  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
7753      isa<ConstantSDNode>(N2)) {
7754    unsigned Opc;
7755    if (VT == MVT::v8i16)
7756      Opc = X86ISD::PINSRW;
7757    else if (VT == MVT::v16i8)
7758      Opc = X86ISD::PINSRB;
7759    else
7760      Opc = X86ISD::PINSRB;
7761
7762    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
7763    // argument.
7764    if (N1.getValueType() != MVT::i32)
7765      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7766    if (N2.getValueType() != MVT::i32)
7767      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7768    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
7769  }
7770
7771  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
7772    // Bits [7:6] of the constant are the source select.  This will always be
7773    //  zero here.  The DAG Combiner may combine an extract_elt index into these
7774    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
7775    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
7776    // Bits [5:4] of the constant are the destination select.  This is the
7777    //  value of the incoming immediate.
7778    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
7779    //   combine either bitwise AND or insert of float 0.0 to set these bits.
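        // For example, a destination index of 2 becomes the immediate 0x20:
        // destination select == 2, source select == 0, zero mask == 0.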
7780    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
7781    // Create this as a scalar to vector.
7782    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
7783    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
7784  }
7785
7786  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
7787    // PINSR* works with constant index.
7788    return Op;
7789  }
7790  return SDValue();
7791}
7792
7793SDValue
7794X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
7795  MVT VT = Op.getSimpleValueType();
7796  MVT EltVT = VT.getVectorElementType();
7797
7798  SDLoc dl(Op);
7799  SDValue N0 = Op.getOperand(0);
7800  SDValue N1 = Op.getOperand(1);
7801  SDValue N2 = Op.getOperand(2);
7802
7803  // If this is a 256-bit vector result, first extract the 128-bit vector,
7804  // insert the element into the extracted half and then place it back.
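      // For example, inserting into element 5 of a v8i32 extracts the upper
      // 128-bit half, inserts at position 5 - 4 == 1, and puts that half back.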
7805  if (VT.is256BitVector() || VT.is512BitVector()) {
7806    if (!isa<ConstantSDNode>(N2))
7807      return SDValue();
7808
7809    // Get the desired 128-bit vector half.
7810    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
7811    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
7812
7813    // Insert the element into the desired half.
7814    unsigned NumEltsIn128 = 128/EltVT.getSizeInBits();
7815    unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128;
7816
7817    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
7818                    DAG.getConstant(IdxIn128, MVT::i32));
7819
7820    // Insert the changed part back to the 256-bit vector
7821    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
7822  }
7823
7824  if (Subtarget->hasSSE41())
7825    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
7826
7827  if (EltVT == MVT::i8)
7828    return SDValue();
7829
7830  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
7831    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
7832    // as its second argument.
7833    if (N1.getValueType() != MVT::i32)
7834      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7835    if (N2.getValueType() != MVT::i32)
7836      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7837    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
7838  }
7839  return SDValue();
7840}
7841
7842static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
7843  SDLoc dl(Op);
7844  MVT OpVT = Op.getSimpleValueType();
7845
7846  // If this is a 256-bit vector result, first insert into a 128-bit
7847  // vector and then insert into the 256-bit vector.
7848  if (!OpVT.is128BitVector()) {
7849    // Insert into a 128-bit vector.
7850    unsigned SizeFactor = OpVT.getSizeInBits()/128;
7851    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
7852                                 OpVT.getVectorNumElements() / SizeFactor);
7853
7854    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
7855
7856    // Insert the 128-bit vector.
7857    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
7858  }
7859
7860  if (OpVT == MVT::v1i64 &&
7861      Op.getOperand(0).getValueType() == MVT::i64)
7862    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
7863
7864  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
7865  assert(OpVT.is128BitVector() && "Expected an SSE type!");
7866  return DAG.getNode(ISD::BITCAST, dl, OpVT,
7867                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
7868}
7869
7870// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
7871// a simple subregister reference or explicit instructions to grab
7872// upper bits of a vector.
7873static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7874                                      SelectionDAG &DAG) {
7875  SDLoc dl(Op);
7876  SDValue In =  Op.getOperand(0);
7877  SDValue Idx = Op.getOperand(1);
7878  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7879  MVT ResVT   = Op.getSimpleValueType();
7880  MVT InVT    = In.getSimpleValueType();
7881
7882  if (Subtarget->hasFp256()) {
7883    if (ResVT.is128BitVector() &&
7884        (InVT.is256BitVector() || InVT.is512BitVector()) &&
7885        isa<ConstantSDNode>(Idx)) {
7886      return Extract128BitVector(In, IdxVal, DAG, dl);
7887    }
7888    if (ResVT.is256BitVector() && InVT.is512BitVector() &&
7889        isa<ConstantSDNode>(Idx)) {
7890      return Extract256BitVector(In, IdxVal, DAG, dl);
7891    }
7892  }
7893  return SDValue();
7894}
7895
7896// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
7897// simple superregister reference or explicit instructions to insert
7898// the upper bits of a vector.
7899static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7900                                     SelectionDAG &DAG) {
7901  if (Subtarget->hasFp256()) {
7902    SDLoc dl(Op.getNode());
7903    SDValue Vec = Op.getNode()->getOperand(0);
7904    SDValue SubVec = Op.getNode()->getOperand(1);
7905    SDValue Idx = Op.getNode()->getOperand(2);
7906
7907    if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
7908         Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
7909        SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
7910        isa<ConstantSDNode>(Idx)) {
7911      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7912      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
7913    }
7914
7915    if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
7916        SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
7917        isa<ConstantSDNode>(Idx)) {
7918      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7919      return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
7920    }
7921  }
7922  return SDValue();
7923}
7924
7925// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
7926// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
7927// one of the above-mentioned nodes. It has to be wrapped because otherwise
7928// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
7929// be used to form addressing modes. These wrapped nodes will be selected
7930// into MOV32ri.
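// As an illustrative sketch (not exhaustive), a constant-pool reference ends
// up as one of the following shapes depending on the PIC style:
//   (X86ISD::Wrapper    (TargetConstantPool C))                 ; static
//   (X86ISD::WrapperRIP (TargetConstantPool C))                 ; RIP-relative
//   (add GlobalBaseReg, (X86ISD::Wrapper (TargetConstantPool C@GOTOFF)))
//                                                       ; 32-bit GOT-style PIC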
7931SDValue
7932X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
7933  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
7934
7935  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7936  // global base reg.
7937  unsigned char OpFlag = 0;
7938  unsigned WrapperKind = X86ISD::Wrapper;
7939  CodeModel::Model M = getTargetMachine().getCodeModel();
7940
7941  if (Subtarget->isPICStyleRIPRel() &&
7942      (M == CodeModel::Small || M == CodeModel::Kernel))
7943    WrapperKind = X86ISD::WrapperRIP;
7944  else if (Subtarget->isPICStyleGOT())
7945    OpFlag = X86II::MO_GOTOFF;
7946  else if (Subtarget->isPICStyleStubPIC())
7947    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7948
7949  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
7950                                             CP->getAlignment(),
7951                                             CP->getOffset(), OpFlag);
7952  SDLoc DL(CP);
7953  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7954  // With PIC, the address is actually $g + Offset.
7955  if (OpFlag) {
7956    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7957                         DAG.getNode(X86ISD::GlobalBaseReg,
7958                                     SDLoc(), getPointerTy()),
7959                         Result);
7960  }
7961
7962  return Result;
7963}
7964
7965SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
7966  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
7967
7968  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7969  // global base reg.
7970  unsigned char OpFlag = 0;
7971  unsigned WrapperKind = X86ISD::Wrapper;
7972  CodeModel::Model M = getTargetMachine().getCodeModel();
7973
7974  if (Subtarget->isPICStyleRIPRel() &&
7975      (M == CodeModel::Small || M == CodeModel::Kernel))
7976    WrapperKind = X86ISD::WrapperRIP;
7977  else if (Subtarget->isPICStyleGOT())
7978    OpFlag = X86II::MO_GOTOFF;
7979  else if (Subtarget->isPICStyleStubPIC())
7980    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7981
7982  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
7983                                          OpFlag);
7984  SDLoc DL(JT);
7985  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7986
7987  // With PIC, the address is actually $g + Offset.
7988  if (OpFlag)
7989    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7990                         DAG.getNode(X86ISD::GlobalBaseReg,
7991                                     SDLoc(), getPointerTy()),
7992                         Result);
7993
7994  return Result;
7995}
7996
7997SDValue
7998X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
7999  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
8000
8001  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
8002  // global base reg.
8003  unsigned char OpFlag = 0;
8004  unsigned WrapperKind = X86ISD::Wrapper;
8005  CodeModel::Model M = getTargetMachine().getCodeModel();
8006
8007  if (Subtarget->isPICStyleRIPRel() &&
8008      (M == CodeModel::Small || M == CodeModel::Kernel)) {
8009    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
8010      OpFlag = X86II::MO_GOTPCREL;
8011    WrapperKind = X86ISD::WrapperRIP;
8012  } else if (Subtarget->isPICStyleGOT()) {
8013    OpFlag = X86II::MO_GOT;
8014  } else if (Subtarget->isPICStyleStubPIC()) {
8015    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
8016  } else if (Subtarget->isPICStyleStubNoDynamic()) {
8017    OpFlag = X86II::MO_DARWIN_NONLAZY;
8018  }
8019
8020  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
8021
8022  SDLoc DL(Op);
8023  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
8024
8025  // With PIC, the address is actually $g + Offset.
8026  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
8027      !Subtarget->is64Bit()) {
8028    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
8029                         DAG.getNode(X86ISD::GlobalBaseReg,
8030                                     SDLoc(), getPointerTy()),
8031                         Result);
8032  }
8033
8034  // For symbols that require a load from a stub to get the address, emit the
8035  // load.
8036  if (isGlobalStubReference(OpFlag))
8037    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
8038                         MachinePointerInfo::getGOT(), false, false, false, 0);
8039
8040  return Result;
8041}
8042
8043SDValue
8044X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
8045  // Create the TargetBlockAddressAddress node.
8046  unsigned char OpFlags =
8047    Subtarget->ClassifyBlockAddressReference();
8048  CodeModel::Model M = getTargetMachine().getCodeModel();
8049  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
8050  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
8051  SDLoc dl(Op);
8052  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
8053                                             OpFlags);
8054
8055  if (Subtarget->isPICStyleRIPRel() &&
8056      (M == CodeModel::Small || M == CodeModel::Kernel))
8057    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
8058  else
8059    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
8060
8061  // With PIC, the address is actually $g + Offset.
8062  if (isGlobalRelativeToPICBase(OpFlags)) {
8063    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
8064                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
8065                         Result);
8066  }
8067
8068  return Result;
8069}
8070
8071SDValue
8072X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
8073                                      int64_t Offset, SelectionDAG &DAG) const {
8074  // Create the TargetGlobalAddress node, folding in the constant
8075  // offset if it is legal.
8076  unsigned char OpFlags =
8077    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
8078  CodeModel::Model M = getTargetMachine().getCodeModel();
8079  SDValue Result;
8080  if (OpFlags == X86II::MO_NO_FLAG &&
8081      X86::isOffsetSuitableForCodeModel(Offset, M)) {
8082    // A direct static reference to a global.
8083    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
8084    Offset = 0;
8085  } else {
8086    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
8087  }
8088
8089  if (Subtarget->isPICStyleRIPRel() &&
8090      (M == CodeModel::Small || M == CodeModel::Kernel))
8091    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
8092  else
8093    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
8094
8095  // With PIC, the address is actually $g + Offset.
8096  if (isGlobalRelativeToPICBase(OpFlags)) {
8097    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
8098                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
8099                         Result);
8100  }
8101
8102  // For globals that require a load from a stub to get the address, emit the
8103  // load.
8104  if (isGlobalStubReference(OpFlags))
8105    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
8106                         MachinePointerInfo::getGOT(), false, false, false, 0);
8107
8108  // If there was a non-zero offset that we didn't fold, create an explicit
8109  // addition for it.
8110  if (Offset != 0)
8111    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
8112                         DAG.getConstant(Offset, getPointerTy()));
8113
8114  return Result;
8115}
8116
8117SDValue
8118X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
8119  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8120  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
8121  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
8122}
8123
8124static SDValue
8125GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
8126           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
8127           unsigned char OperandFlags, bool LocalDynamic = false) {
8128  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
8129  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8130  SDLoc dl(GA);
8131  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
8132                                           GA->getValueType(0),
8133                                           GA->getOffset(),
8134                                           OperandFlags);
8135
8136  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
8137                                           : X86ISD::TLSADDR;
8138
8139  if (InFlag) {
8140    SDValue Ops[] = { Chain,  TGA, *InFlag };
8141    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
8142  } else {
8143    SDValue Ops[]  = { Chain, TGA };
8144    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
8145  }
8146
8147  // TLSADDR is codegen'ed as a call. Inform MFI that the function has calls.
8148  MFI->setAdjustsStack(true);
8149
8150  SDValue Flag = Chain.getValue(1);
8151  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
8152}
8153
8154// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
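// As a rough sketch (the exact registers and relocations come from GetTLSADDR
// and the TLSADDR pseudo expansion), this produces the standard i386
// general-dynamic sequence:
//   leal  x@TLSGD(,%ebx,1), %eax
//   call  ___tls_get_addr@PLT     ; address of x is returned in %eax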
8155static SDValue
8156LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
8157                                const EVT PtrVT) {
8158  SDValue InFlag;
8159  SDLoc dl(GA);  // TODO: the function entry point might be a better location.
8160  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
8161                                   DAG.getNode(X86ISD::GlobalBaseReg,
8162                                               SDLoc(), PtrVT), InFlag);
8163  InFlag = Chain.getValue(1);
8164
8165  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
8166}
8167
8168// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
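// As a rough sketch, this produces the x86-64 general-dynamic sequence (the
// actual bytes, including padding that lets the linker relax the access, come
// from the TLSADDR pseudo expansion):
//   leaq  x@TLSGD(%rip), %rdi
//   call  __tls_get_addr@PLT      ; address of x is returned in %rax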
8169static SDValue
8170LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
8171                                const EVT PtrVT) {
8172  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
8173                    X86::RAX, X86II::MO_TLSGD);
8174}
8175
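// Lower ISD::GlobalTLSAddress using the "local dynamic" model: a single
// TLSBASEADDR/__tls_get_addr call obtains the base of this module's TLS
// block, and each variable is then addressed as base + x@DTPOFF.  Rough
// 64-bit sketch:
//   leaq  x@TLSLD(%rip), %rdi
//   call  __tls_get_addr@PLT      ; %rax = base of this module's TLS block
//   leaq  x@DTPOFF(%rax), %rax    ; address of x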
8176static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
8177                                           SelectionDAG &DAG,
8178                                           const EVT PtrVT,
8179                                           bool is64Bit) {
8180  SDLoc dl(GA);
8181
8182  // Get the start address of the TLS block for this module.
8183  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
8184      .getInfo<X86MachineFunctionInfo>();
8185  MFI->incNumLocalDynamicTLSAccesses();
8186
8187  SDValue Base;
8188  if (is64Bit) {
8189    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
8190                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
8191  } else {
8192    SDValue InFlag;
8193    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
8194        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
8195    InFlag = Chain.getValue(1);
8196    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
8197                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
8198  }
8199
8200  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
8201  // of Base.
8202
8203  // Build x@dtpoff.
8204  unsigned char OperandFlags = X86II::MO_DTPOFF;
8205  unsigned WrapperKind = X86ISD::Wrapper;
8206  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
8207                                           GA->getValueType(0),
8208                                           GA->getOffset(), OperandFlags);
8209  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
8210
8211  // Add x@dtpoff with the base.
8212  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
8213}
8214
8215// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
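// In both models the address is (thread pointer) + (per-variable offset); the
// models differ only in how the offset is materialized.  Rough x86-64 sketch:
//   local-exec:    movq %fs:0, %rax;  leaq x@TPOFF(%rax), %rax
//   initial-exec:  movq %fs:0, %rax;  addq x@GOTTPOFF(%rip), %rax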
8216static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
8217                                   const EVT PtrVT, TLSModel::Model model,
8218                                   bool is64Bit, bool isPIC) {
8219  SDLoc dl(GA);
8220
8221  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
8222  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
8223                                                         is64Bit ? 257 : 256));
8224
8225  SDValue ThreadPointer =
8226      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
8227                  MachinePointerInfo(Ptr), false, false, false, 0);
8228
8229  unsigned char OperandFlags = 0;
8230  // Most TLS accesses are not RIP-relative, even on x86-64.  One exception is
8231  // initial-exec.
8232  unsigned WrapperKind = X86ISD::Wrapper;
8233  if (model == TLSModel::LocalExec) {
8234    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
8235  } else if (model == TLSModel::InitialExec) {
8236    if (is64Bit) {
8237      OperandFlags = X86II::MO_GOTTPOFF;
8238      WrapperKind = X86ISD::WrapperRIP;
8239    } else {
8240      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
8241    }
8242  } else {
8243    llvm_unreachable("Unexpected model");
8244  }
8245
8246  // emit "addl x@ntpoff,%eax" (local exec)
8247  // or "addl x@indntpoff,%eax" (initial exec)
8248  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
8249  SDValue TGA =
8250      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
8251                                 GA->getOffset(), OperandFlags);
8252  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
8253
8254  if (model == TLSModel::InitialExec) {
8255    if (isPIC && !is64Bit) {
8256      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
8257                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
8258                           Offset);
8259    }
8260
8261    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
8262                         MachinePointerInfo::getGOT(), false, false, false, 0);
8263  }
8264
8265  // The address of the thread local variable is the add of the thread
8266  // pointer with the offset of the variable.
8267  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
8268}
8269
8270SDValue
8271X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
8272
8273  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8274  const GlobalValue *GV = GA->getGlobal();
8275
8276  if (Subtarget->isTargetELF()) {
8277    TLSModel::Model model = getTargetMachine().getTLSModel(GV);
8278
8279    switch (model) {
8280      case TLSModel::GeneralDynamic:
8281        if (Subtarget->is64Bit())
8282          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
8283        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
8284      case TLSModel::LocalDynamic:
8285        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
8286                                           Subtarget->is64Bit());
8287      case TLSModel::InitialExec:
8288      case TLSModel::LocalExec:
8289        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
8290                                   Subtarget->is64Bit(),
8291                        getTargetMachine().getRelocationModel() == Reloc::PIC_);
8292    }
8293    llvm_unreachable("Unknown TLS model.");
8294  }
8295
8296  if (Subtarget->isTargetDarwin()) {
8297    // Darwin only has one model of TLS.  Lower to that.
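    // Rough x86-64 sketch of that model: the variable is referenced through
    // its TLV descriptor (x@TLVP), and the access calls the handler stored in
    // the descriptor:
    //   movq  _x@TLVP(%rip), %rdi
    //   callq *(%rdi)             ; address of x is returned in %rax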
8298    unsigned char OpFlag = 0;
8299    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
8300                           X86ISD::WrapperRIP : X86ISD::Wrapper;
8301
8302    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
8303    // global base reg.
8304    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
8305                  !Subtarget->is64Bit();
8306    if (PIC32)
8307      OpFlag = X86II::MO_TLVP_PIC_BASE;
8308    else
8309      OpFlag = X86II::MO_TLVP;
8310    SDLoc DL(Op);
8311    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
8312                                                GA->getValueType(0),
8313                                                GA->getOffset(), OpFlag);
8314    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
8315
8316    // With PIC32, the address is actually $g + Offset.
8317    if (PIC32)
8318      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
8319                           DAG.getNode(X86ISD::GlobalBaseReg,
8320                                       SDLoc(), getPointerTy()),
8321                           Offset);
8322
8323    // Lowering the TLSCALL machine ISD node will make sure everything is in
8324    // the right location.
8325    SDValue Chain = DAG.getEntryNode();
8326    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8327    SDValue Args[] = { Chain, Offset };
8328    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
8329
8330    // TLSCALL is codegen'ed as a call. Inform MFI that the function has calls.
8331    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
8332    MFI->setAdjustsStack(true);
8333
8334    // And our return value (tls address) is in the standard call return value
8335    // location.
8336    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
8337    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
8338                              Chain.getValue(1));
8339  }
8340
8341  if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
8342    // Just use the implicit TLS architecture
8343    // Need to generate something similar to:
8344    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
8345    //                                  ; from TEB
8346    //   mov     ecx, dword [rel _tls_index] ; Load index (from C runtime)
8347    //   mov     rcx, qword [rdx+rcx*8]
8348    //   mov     eax, .tls$:tlsvar
8349    //   [rax+rcx] contains the address
8350    // Windows 64bit: gs:0x58
8351    // Windows 32bit: fs:__tls_array
8352
8353    // If GV is an alias then use the aliasee for determining
8354    // thread-localness.
8355    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
8356      GV = GA->resolveAliasedGlobal(false);
8357    SDLoc dl(GA);
8358    SDValue Chain = DAG.getEntryNode();
8359
8360    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
8361    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
8362    // use its literal value of 0x2C.
8363    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
8364                                        ? Type::getInt8PtrTy(*DAG.getContext(),
8365                                                             256)
8366                                        : Type::getInt32PtrTy(*DAG.getContext(),
8367                                                              257));
8368
8369    SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
8370      (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
8371        DAG.getExternalSymbol("_tls_array", getPointerTy()));
8372
8373    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
8374                                        MachinePointerInfo(Ptr),
8375                                        false, false, false, 0);
8376
8377    // Load the _tls_index variable
8378    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
8379    if (Subtarget->is64Bit())
8380      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
8381                           IDX, MachinePointerInfo(), MVT::i32,
8382                           false, false, 0);
8383    else
8384      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
8385                        false, false, false, 0);
8386
8387    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
8388                                    getPointerTy());
8389    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
8390
8391    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
8392    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
8393                      false, false, false, 0);
8394
8395    // Get the offset of the start of the .tls section.
8396    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
8397                                             GA->getValueType(0),
8398                                             GA->getOffset(), X86II::MO_SECREL);
8399    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
8400
8401    // The address of the thread local variable is the add of the thread
8402    // pointer with the offset of the variable.
8403    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
8404  }
8405
8406  llvm_unreachable("TLS not implemented for this target.");
8407}
8408
8409/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
8410/// and take a 2 x i32 value to shift plus a shift amount.
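/// As a rough illustration (not the exact emitted code), a 64-bit left shift
/// of %edx:%eax by %cl on i386 becomes approximately:
///   shldl   %cl, %eax, %edx     ; Hi = (Hi << amt) | (Lo >> (32 - amt))
///   shll    %cl, %eax           ; Lo = Lo << amt
///   testb   $32, %cl            ; if amt >= 32 the halves need fixing up:
///   cmovnel %eax, %edx          ;   Hi = shifted Lo
///   cmovnel %ebx, %eax          ;   Lo = 0 (assumes %ebx holds 0)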
8411SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
8412  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
8413  EVT VT = Op.getValueType();
8414  unsigned VTBits = VT.getSizeInBits();
8415  SDLoc dl(Op);
8416  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
8417  SDValue ShOpLo = Op.getOperand(0);
8418  SDValue ShOpHi = Op.getOperand(1);
8419  SDValue ShAmt  = Op.getOperand(2);
8420  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
8421                                     DAG.getConstant(VTBits - 1, MVT::i8))
8422                       : DAG.getConstant(0, VT);
8423
8424  SDValue Tmp2, Tmp3;
8425  if (Op.getOpcode() == ISD::SHL_PARTS) {
8426    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
8427    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
8428  } else {
8429    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
8430    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
8431  }
8432
8433  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
8434                                DAG.getConstant(VTBits, MVT::i8));
8435  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
8436                             AndNode, DAG.getConstant(0, MVT::i8));
8437
8438  SDValue Hi, Lo;
8439  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
8440  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
8441  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
8442
8443  if (Op.getOpcode() == ISD::SHL_PARTS) {
8444    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
8445    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
8446  } else {
8447    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
8448    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
8449  }
8450
8451  SDValue Ops[2] = { Lo, Hi };
8452  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
8453}
8454
8455SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
8456                                           SelectionDAG &DAG) const {
8457  EVT SrcVT = Op.getOperand(0).getValueType();
8458
8459  if (SrcVT.isVector())
8460    return SDValue();
8461
8462  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
8463         "Unknown SINT_TO_FP to lower!");
8464
8465  // These are really Legal; return the operand so the caller accepts it as
8466  // Legal.
8467  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
8468    return Op;
8469  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
8470      Subtarget->is64Bit()) {
8471    return Op;
8472  }
8473
8474  SDLoc dl(Op);
8475  unsigned Size = SrcVT.getSizeInBits()/8;
8476  MachineFunction &MF = DAG.getMachineFunction();
8477  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
8478  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8479  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8480                               StackSlot,
8481                               MachinePointerInfo::getFixedStack(SSFI),
8482                               false, false, 0);
8483  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
8484}
8485
8486SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
8487                                     SDValue StackSlot,
8488                                     SelectionDAG &DAG) const {
8489  // Build the FILD
8490  SDLoc DL(Op);
8491  SDVTList Tys;
8492  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
8493  if (useSSE)
8494    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
8495  else
8496    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
8497
8498  unsigned ByteSize = SrcVT.getSizeInBits()/8;
8499
8500  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
8501  MachineMemOperand *MMO;
8502  if (FI) {
8503    int SSFI = FI->getIndex();
8504    MMO =
8505      DAG.getMachineFunction()
8506      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8507                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
8508  } else {
8509    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
8510    StackSlot = StackSlot.getOperand(1);
8511  }
8512  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
8513  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
8514                                           X86ISD::FILD, DL,
8515                                           Tys, Ops, array_lengthof(Ops),
8516                                           SrcVT, MMO);
8517
8518  if (useSSE) {
8519    Chain = Result.getValue(1);
8520    SDValue InFlag = Result.getValue(2);
8521
8522    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
8523    // shouldn't be necessary except that RFP cannot be live across
8524    // multiple blocks. When stackifier is fixed, they can be uncoupled.
8525    MachineFunction &MF = DAG.getMachineFunction();
8526    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
8527    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
8528    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8529    Tys = DAG.getVTList(MVT::Other);
8530    SDValue Ops[] = {
8531      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
8532    };
8533    MachineMemOperand *MMO =
8534      DAG.getMachineFunction()
8535      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8536                            MachineMemOperand::MOStore, SSFISize, SSFISize);
8537
8538    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
8539                                    Ops, array_lengthof(Ops),
8540                                    Op.getValueType(), MMO);
8541    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
8542                         MachinePointerInfo::getFixedStack(SSFI),
8543                         false, false, false, 0);
8544  }
8545
8546  return Result;
8547}
8548
8549// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
8550SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
8551                                               SelectionDAG &DAG) const {
8552  // This algorithm is not obvious. Here is what we're trying to output:
8553  /*
8554     movq       %rax,  %xmm0
8555     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
8556     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
8557     #ifdef __SSE3__
8558       haddpd   %xmm0, %xmm0
8559     #else
8560       pshufd   $0x4e, %xmm0, %xmm1
8561       addpd    %xmm1, %xmm0
8562     #endif
8563  */
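  // Why this works (informal sketch): punpckldq pairs the low and high 32-bit
  // halves of the input with the exponent words 0x43300000 (2^52) and
  // 0x45300000 (2^84), producing the doubles (2^52 + lo) and (2^84 + hi*2^32).
  // Subtracting the constants {2^52, 2^84} removes the biases, and the final
  // horizontal add yields hi*2^32 + lo, i.e. the original unsigned value.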
8564
8565  SDLoc dl(Op);
8566  LLVMContext *Context = DAG.getContext();
8567
8568  // Build some magic constants.
8569  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
8570  Constant *C0 = ConstantDataVector::get(*Context, CV0);
8571  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
8572
8573  SmallVector<Constant*,2> CV1;
8574  CV1.push_back(
8575    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
8576                                      APInt(64, 0x4330000000000000ULL))));
8577  CV1.push_back(
8578    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
8579                                      APInt(64, 0x4530000000000000ULL))));
8580  Constant *C1 = ConstantVector::get(CV1);
8581  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
8582
8583  // Load the 64-bit value into an XMM register.
8584  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
8585                            Op.getOperand(0));
8586  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
8587                              MachinePointerInfo::getConstantPool(),
8588                              false, false, false, 16);
8589  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
8590                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
8591                              CLod0);
8592
8593  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
8594                              MachinePointerInfo::getConstantPool(),
8595                              false, false, false, 16);
8596  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
8597  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
8598  SDValue Result;
8599
8600  if (Subtarget->hasSSE3()) {
8601    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
8602    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
8603  } else {
8604    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
8605    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
8606                                           S2F, 0x4E, DAG);
8607    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
8608                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
8609                         Sub);
8610  }
8611
8612  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
8613                     DAG.getIntPtrConstant(0));
8614}
8615
8616// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
8617SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
8618                                               SelectionDAG &DAG) const {
8619  SDLoc dl(Op);
8620  // FP constant to bias correct the final result.
8621  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
8622                                   MVT::f64);
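  // 0x4330000000000000 is 2^52 as an IEEE double.  OR-ing the 32-bit input
  // into the low mantissa bits of 2^52 yields the double (2^52 + x); the
  // subtraction below cancels the bias, leaving exactly x (informal sketch of
  // the standard u32 -> f64 trick).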
8623
8624  // Load the 32-bit value into an XMM register.
8625  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
8626                             Op.getOperand(0));
8627
8628  // Zero out the upper parts of the register.
8629  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
8630
8631  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8632                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
8633                     DAG.getIntPtrConstant(0));
8634
8635  // Or the load with the bias.
8636  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
8637                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
8638                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8639                                                   MVT::v2f64, Load)),
8640                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
8641                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8642                                                   MVT::v2f64, Bias)));
8643  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8644                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
8645                   DAG.getIntPtrConstant(0));
8646
8647  // Subtract the bias.
8648  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
8649
8650  // Handle final rounding.
8651  EVT DestVT = Op.getValueType();
8652
8653  if (DestVT.bitsLT(MVT::f64))
8654    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
8655                       DAG.getIntPtrConstant(0));
8656  if (DestVT.bitsGT(MVT::f64))
8657    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
8658
8659  // Handle final rounding.
8660  return Sub;
8661}
8662
8663SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
8664                                               SelectionDAG &DAG) const {
8665  SDValue N0 = Op.getOperand(0);
8666  EVT SVT = N0.getValueType();
8667  SDLoc dl(Op);
8668
8669  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
8670          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
8671         "Custom UINT_TO_FP is not supported!");
8672
8673  EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
8674                             SVT.getVectorNumElements());
8675  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
8676                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
8677}
8678
8679SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
8680                                           SelectionDAG &DAG) const {
8681  SDValue N0 = Op.getOperand(0);
8682  SDLoc dl(Op);
8683
8684  if (Op.getValueType().isVector())
8685    return lowerUINT_TO_FP_vec(Op, DAG);
8686
8687  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
8688  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
8689  // the optimization here.
8690  if (DAG.SignBitIsZero(N0))
8691    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
8692
8693  EVT SrcVT = N0.getValueType();
8694  EVT DstVT = Op.getValueType();
8695  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
8696    return LowerUINT_TO_FP_i64(Op, DAG);
8697  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
8698    return LowerUINT_TO_FP_i32(Op, DAG);
8699  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
8700    return SDValue();
8701
8702  // Make a 64-bit buffer, and use it to build an FILD.
8703  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
8704  if (SrcVT == MVT::i32) {
8705    SDValue WordOff = DAG.getConstant(4, getPointerTy());
8706    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
8707                                     getPointerTy(), StackSlot, WordOff);
8708    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8709                                  StackSlot, MachinePointerInfo(),
8710                                  false, false, 0);
8711    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
8712                                  OffsetSlot, MachinePointerInfo(),
8713                                  false, false, 0);
8714    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
8715    return Fild;
8716  }
8717
8718  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
8719  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8720                               StackSlot, MachinePointerInfo(),
8721                               false, false, 0);
8722  // For i64 source, we need to add the appropriate power of 2 if the input
8723  // was negative.  This is the same as the optimization in
8724  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
8725  // we must be careful to do the computation in x87 extended precision, not
8726  // in SSE. (The generic code can't know it's OK to do this, or how to.)
8727  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
8728  MachineMemOperand *MMO =
8729    DAG.getMachineFunction()
8730    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8731                          MachineMemOperand::MOLoad, 8, 8);
8732
8733  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
8734  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
8735  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
8736                                         array_lengthof(Ops), MVT::i64, MMO);
8737
8738  APInt FF(32, 0x5F800000ULL);
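  // 0x5F800000 is 2^64 as an IEEE single.  FILD treats the i64 bit pattern as
  // signed, so if the sign bit was set we add 2^64 (in x87 extended precision)
  // below to recover the intended unsigned value; otherwise we add 0.0.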
8739
8740  // Check whether the sign bit is set.
8741  SDValue SignSet = DAG.getSetCC(dl,
8742                                 getSetCCResultType(*DAG.getContext(), MVT::i64),
8743                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
8744                                 ISD::SETLT);
8745
8746  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
8747  SDValue FudgePtr = DAG.getConstantPool(
8748                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
8749                                         getPointerTy());
8750
8751  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
8752  SDValue Zero = DAG.getIntPtrConstant(0);
8753  SDValue Four = DAG.getIntPtrConstant(4);
8754  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
8755                               Zero, Four);
8756  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
8757
8758  // Load the value out, extending it from f32 to f80.
8759  // FIXME: Avoid the extend by constructing the right constant pool?
8760  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
8761                                 FudgePtr, MachinePointerInfo::getConstantPool(),
8762                                 MVT::f32, false, false, 4);
8763  // Extend everything to 80 bits to force it to be done on x87.
8764  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
8765  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
8766}
8767
8768std::pair<SDValue,SDValue>
8769X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
8770                                    bool IsSigned, bool IsReplace) const {
8771  SDLoc DL(Op);
8772
8773  EVT DstTy = Op.getValueType();
8774
8775  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
8776    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
8777    DstTy = MVT::i64;
8778  }
8779
8780  assert(DstTy.getSimpleVT() <= MVT::i64 &&
8781         DstTy.getSimpleVT() >= MVT::i16 &&
8782         "Unknown FP_TO_INT to lower!");
8783
8784  // These are really Legal.
8785  if (DstTy == MVT::i32 &&
8786      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8787    return std::make_pair(SDValue(), SDValue());
8788  if (Subtarget->is64Bit() &&
8789      DstTy == MVT::i64 &&
8790      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8791    return std::make_pair(SDValue(), SDValue());
8792
8793  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
8794  // stack slot, or into the FTOL runtime function.
8795  MachineFunction &MF = DAG.getMachineFunction();
8796  unsigned MemSize = DstTy.getSizeInBits()/8;
8797  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8798  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8799
8800  unsigned Opc;
8801  if (!IsSigned && isIntegerTypeFTOL(DstTy))
8802    Opc = X86ISD::WIN_FTOL;
8803  else
8804    switch (DstTy.getSimpleVT().SimpleTy) {
8805    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
8806    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
8807    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
8808    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
8809    }
8810
8811  SDValue Chain = DAG.getEntryNode();
8812  SDValue Value = Op.getOperand(0);
8813  EVT TheVT = Op.getOperand(0).getValueType();
8814  // FIXME: This causes a redundant load/store if the SSE-class value is
8815  // already in memory, such as if it is on the call stack.
8816  if (isScalarFPTypeInSSEReg(TheVT)) {
8817    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
8818    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
8819                         MachinePointerInfo::getFixedStack(SSFI),
8820                         false, false, 0);
8821    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
8822    SDValue Ops[] = {
8823      Chain, StackSlot, DAG.getValueType(TheVT)
8824    };
8825
8826    MachineMemOperand *MMO =
8827      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8828                              MachineMemOperand::MOLoad, MemSize, MemSize);
8829    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
8830                                    array_lengthof(Ops), DstTy, MMO);
8831    Chain = Value.getValue(1);
8832    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8833    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8834  }
8835
8836  MachineMemOperand *MMO =
8837    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8838                            MachineMemOperand::MOStore, MemSize, MemSize);
8839
8840  if (Opc != X86ISD::WIN_FTOL) {
8841    // Build the FP_TO_INT*_IN_MEM
8842    SDValue Ops[] = { Chain, Value, StackSlot };
8843    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
8844                                           Ops, array_lengthof(Ops), DstTy,
8845                                           MMO);
8846    return std::make_pair(FIST, StackSlot);
8847  } else {
8848    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
8849      DAG.getVTList(MVT::Other, MVT::Glue),
8850      Chain, Value);
8851    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
8852      MVT::i32, ftol.getValue(1));
8853    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
8854      MVT::i32, eax.getValue(2));
8855    SDValue Ops[] = { eax, edx };
8856    SDValue pair = IsReplace
8857      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
8858      : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
8859    return std::make_pair(pair, SDValue());
8860  }
8861}
8862
8863static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
8864                              const X86Subtarget *Subtarget) {
8865  MVT VT = Op->getSimpleValueType(0);
8866  SDValue In = Op->getOperand(0);
8867  MVT InVT = In.getSimpleValueType();
8868  SDLoc dl(Op);
8869
8870  // Optimize vectors in AVX mode:
8871  //
8872  //   v8i16 -> v8i32
8873  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
8874  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
8875  //   Concat upper and lower parts.
8876  //
8877  //   v4i32 -> v4i64
8878  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
8879  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
8880  //   Concat upper and lower parts.
8881  //
8882
8883  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
8884      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
8885      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
8886    return SDValue();
8887
8888  if (Subtarget->hasInt256())
8889    return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In);
8890
8891  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
8892  SDValue Undef = DAG.getUNDEF(InVT);
8893  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
8894  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
8895  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
8896
8897  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
8898                             VT.getVectorNumElements()/2);
8899
8900  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
8901  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
8902
8903  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
8904}
8905
8906static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
8907                                       SelectionDAG &DAG) {
8908  MVT VT = Op->getValueType(0).getSimpleVT();
8909  SDValue In = Op->getOperand(0);
8910  MVT InVT = In.getValueType().getSimpleVT();
8911  SDLoc DL(Op);
8912  unsigned int NumElts = VT.getVectorNumElements();
8913  if (NumElts != 8 && NumElts != 16)
8914    return SDValue();
8915
8916  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
8917    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
8918
8919  EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
8920  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8921  // At this point only mask (i1 vector) extension remains to be handled.
8922  assert(InVT.getVectorElementType() == MVT::i1);
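  // Sketch of the approach taken here: broadcast the constant 1 under the i1
  // mask (X86ISD::VBROADCASTM), so lanes whose mask bit is set become 1 and
  // the remaining lanes become 0; if the requested result is narrower than
  // 512 bits, truncate afterwards.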
8923  SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
8924  const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
8925  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
8926  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
8927  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
8928                           MachinePointerInfo::getConstantPool(),
8929                           false, false, false, Alignment);
8930
8931  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
8932  if (VT.is512BitVector())
8933    return Brcst;
8934  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
8935}
8936
8937static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
8938                               SelectionDAG &DAG) {
8939  if (Subtarget->hasFp256()) {
8940    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
8941    if (Res.getNode())
8942      return Res;
8943  }
8944
8945  return SDValue();
8946}
8947
8948static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
8949                                SelectionDAG &DAG) {
8950  SDLoc DL(Op);
8951  MVT VT = Op.getSimpleValueType();
8952  SDValue In = Op.getOperand(0);
8953  MVT SVT = In.getSimpleValueType();
8954
8955  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
8956    return LowerZERO_EXTEND_AVX512(Op, DAG);
8957
8958  if (Subtarget->hasFp256()) {
8959    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
8960    if (Res.getNode())
8961      return Res;
8962  }
8963
8964  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
8965         VT.getVectorNumElements() != SVT.getVectorNumElements());
8966  return SDValue();
8967}
8968
8969SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
8970  SDLoc DL(Op);
8971  MVT VT = Op.getSimpleValueType();
8972  SDValue In = Op.getOperand(0);
8973  MVT InVT = In.getSimpleValueType();
8974  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
8975         "Invalid TRUNCATE operation");
8976
8977  if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
8978    if (VT.getVectorElementType().getSizeInBits() >= 8)
8979      return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
8980
8981    assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
8982    unsigned NumElts = InVT.getVectorNumElements();
8983    assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
8984    if (InVT.getSizeInBits() < 512) {
8985      MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
8986      In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
8987      InVT = ExtVT;
8988    }
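    // Sketch of the approach: truncating to i1 keeps only bit 0 of each
    // element, so AND the input with a broadcast of the constant 1 and use
    // TESTM, whose mask bit is set exactly when that AND is non-zero.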
8989    SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
8990    const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
8991    SDValue CP = DAG.getConstantPool(C, getPointerTy());
8992    unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
8993    SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
8994                           MachinePointerInfo::getConstantPool(),
8995                           false, false, false, Alignment);
8996    SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
8997    SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
8998    return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
8999  }
9000
9001  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
9002    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
9003    if (Subtarget->hasInt256()) {
9004      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
9005      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
9006      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
9007                                ShufMask);
9008      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
9009                         DAG.getIntPtrConstant(0));
9010    }
9011
9012    // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
9013    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
9014                               DAG.getIntPtrConstant(0));
9015    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
9016                               DAG.getIntPtrConstant(2));
9017
9018    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
9019    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
9020
9021    // The PSHUFD mask:
9022    static const int ShufMask1[] = {0, 2, 0, 0};
9023    SDValue Undef = DAG.getUNDEF(VT);
9024    OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
9025    OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);
9026
9027    // The MOVLHPS mask:
9028    static const int ShufMask2[] = {0, 1, 4, 5};
9029    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
9030  }
9031
9032  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
9033    // On AVX2, v8i32 -> v8i16 becomes a PSHUFB.
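    // Sketch of the idea: within each 128-bit lane, the PSHUFB mask built
    // below keeps bytes {0,1, 4,5, 8,9, 12,13} (the low half of each i32) and
    // zeroes the rest (0x80); the v4i64 shuffle {0, 2} then concatenates the
    // two lanes' packed results into the low 128 bits.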
9034    if (Subtarget->hasInt256()) {
9035      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
9036
9037      SmallVector<SDValue,32> pshufbMask;
9038      for (unsigned i = 0; i < 2; ++i) {
9039        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
9040        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
9041        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
9042        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
9043        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
9044        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
9045        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
9046        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
9047        for (unsigned j = 0; j < 8; ++j)
9048          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
9049      }
9050      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
9051                               &pshufbMask[0], 32);
9052      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
9053      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
9054
9055      static const int ShufMask[] = {0,  2,  -1,  -1};
9056      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
9057                                &ShufMask[0]);
9058      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
9059                       DAG.getIntPtrConstant(0));
9060      return DAG.getNode(ISD::BITCAST, DL, VT, In);
9061    }
9062
9063    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
9064                               DAG.getIntPtrConstant(0));
9065
9066    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
9067                               DAG.getIntPtrConstant(4));
9068
9069    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
9070    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
9071
9072    // The PSHUFB mask:
9073    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
9074                                   -1, -1, -1, -1, -1, -1, -1, -1};
9075
9076    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
9077    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
9078    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
9079
9080    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
9081    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
9082
9083    // The MOVLHPS Mask:
9084    static const int ShufMask2[] = {0, 1, 4, 5};
9085    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
9086    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
9087  }
9088
9089  // Handle truncation of V256 to V128 using shuffles.
9090  if (!VT.is128BitVector() || !InVT.is256BitVector())
9091    return SDValue();
9092
9093  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
9094
9095  unsigned NumElems = VT.getVectorNumElements();
9096  EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
9097                             NumElems * 2);
9098
9099  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
9100  // Prepare truncation shuffle mask
9101  for (unsigned i = 0; i != NumElems; ++i)
9102    MaskVec[i] = i * 2;
9103  SDValue V = DAG.getVectorShuffle(NVT, DL,
9104                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
9105                                   DAG.getUNDEF(NVT), &MaskVec[0]);
9106  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
9107                     DAG.getIntPtrConstant(0));
9108}
9109
9110SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
9111                                           SelectionDAG &DAG) const {
9112  MVT VT = Op.getSimpleValueType();
9113  if (VT.isVector()) {
9114    if (VT == MVT::v8i16)
9115      return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
9116                         DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op),
9117                                     MVT::v8i32, Op.getOperand(0)));
9118    return SDValue();
9119  }
9120
9121  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
9122    /*IsSigned=*/ true, /*IsReplace=*/ false);
9123  SDValue FIST = Vals.first, StackSlot = Vals.second;
9124  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
9125  if (FIST.getNode() == 0) return Op;
9126
9127  if (StackSlot.getNode())
9128    // Load the result.
9129    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
9130                       FIST, StackSlot, MachinePointerInfo(),
9131                       false, false, false, 0);
9132
9133  // The node is the result.
9134  return FIST;
9135}
9136
9137SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
9138                                           SelectionDAG &DAG) const {
9139  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
9140    /*IsSigned=*/ false, /*IsReplace=*/ false);
9141  SDValue FIST = Vals.first, StackSlot = Vals.second;
9142  assert(FIST.getNode() && "Unexpected failure");
9143
9144  if (StackSlot.getNode())
9145    // Load the result.
9146    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
9147                       FIST, StackSlot, MachinePointerInfo(),
9148                       false, false, false, 0);
9149
9150  // The node is the result.
9151  return FIST;
9152}
9153
9154static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
9155  SDLoc DL(Op);
9156  MVT VT = Op.getSimpleValueType();
9157  SDValue In = Op.getOperand(0);
9158  MVT SVT = In.getSimpleValueType();
9159
9160  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
9161
9162  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
9163                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
9164                                 In, DAG.getUNDEF(SVT)));
9165}
9166
9167SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
9168  LLVMContext *Context = DAG.getContext();
9169  SDLoc dl(Op);
9170  MVT VT = Op.getSimpleValueType();
9171  MVT EltVT = VT;
9172  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
9173  if (VT.isVector()) {
9174    EltVT = VT.getVectorElementType();
9175    NumElts = VT.getVectorNumElements();
9176  }
9177  Constant *C;
9178  if (EltVT == MVT::f64)
9179    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
9180                                          APInt(64, ~(1ULL << 63))));
9181  else
9182    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
9183                                          APInt(32, ~(1U << 31))));
9184  C = ConstantVector::getSplat(NumElts, C);
9185  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
9186  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
9187  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
9188                             MachinePointerInfo::getConstantPool(),
9189                             false, false, false, Alignment);
9190  if (VT.isVector()) {
9191    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
9192    return DAG.getNode(ISD::BITCAST, dl, VT,
9193                       DAG.getNode(ISD::AND, dl, ANDVT,
9194                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
9195                                               Op.getOperand(0)),
9196                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
9197  }
9198  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
9199}
9200
9201SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
9202  LLVMContext *Context = DAG.getContext();
9203  SDLoc dl(Op);
9204  MVT VT = Op.getSimpleValueType();
9205  MVT EltVT = VT;
9206  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
9207  if (VT.isVector()) {
9208    EltVT = VT.getVectorElementType();
9209    NumElts = VT.getVectorNumElements();
9210  }
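  // FNEG is lowered as an XOR with a constant-pool mask that has only the sign
  // bit of each element set.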
9211  Constant *C;
9212  if (EltVT == MVT::f64)
9213    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
9214                                          APInt(64, 1ULL << 63)));
9215  else
9216    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
9217                                          APInt(32, 1U << 31)));
9218  C = ConstantVector::getSplat(NumElts, C);
9219  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
9220  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
9221  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
9222                             MachinePointerInfo::getConstantPool(),
9223                             false, false, false, Alignment);
9224  if (VT.isVector()) {
9225    MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
9226    return DAG.getNode(ISD::BITCAST, dl, VT,
9227                       DAG.getNode(ISD::XOR, dl, XORVT,
9228                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
9229                                               Op.getOperand(0)),
9230                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
9231  }
9232
9233  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
9234}
9235
9236SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
9237  LLVMContext *Context = DAG.getContext();
9238  SDValue Op0 = Op.getOperand(0);
9239  SDValue Op1 = Op.getOperand(1);
9240  SDLoc dl(Op);
9241  MVT VT = Op.getSimpleValueType();
9242  MVT SrcVT = Op1.getSimpleValueType();
9243
9244  // If second operand is smaller, extend it first.
9245  if (SrcVT.bitsLT(VT)) {
9246    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
9247    SrcVT = VT;
9248  }
9249  // And if it is bigger, shrink it first.
9250  if (SrcVT.bitsGT(VT)) {
9251    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
9252    SrcVT = VT;
9253  }
9254
9255  // At this point the operands and the result should have the same
9256  // type, and that won't be f80 since that is not custom lowered.
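  // Overall this computes copysign(Op0, Op1) as (Op0 & ~SignMask) | (Op1 & SignMask),
  // with both masks loaded from the constant pool.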
9257
9258  // First get the sign bit of the second operand.
9259  SmallVector<Constant*,4> CV;
9260  if (SrcVT == MVT::f64) {
9261    const fltSemantics &Sem = APFloat::IEEEdouble;
9262    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
9263    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
9264  } else {
9265    const fltSemantics &Sem = APFloat::IEEEsingle;
9266    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
9267    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9268    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9269    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9270  }
9271  Constant *C = ConstantVector::get(CV);
9272  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
9273  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
9274                              MachinePointerInfo::getConstantPool(),
9275                              false, false, false, 16);
9276  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
9277
9278  // Shift sign bit right or left if the two operands have different types.
9279  if (SrcVT.bitsGT(VT)) {
9280    // Op0 is MVT::f32, Op1 is MVT::f64.
9281    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
9282    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
9283                          DAG.getConstant(32, MVT::i32));
9284    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
9285    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
9286                          DAG.getIntPtrConstant(0));
9287  }
9288
9289  // Clear first operand sign bit.
9290  CV.clear();
9291  if (VT == MVT::f64) {
9292    const fltSemantics &Sem = APFloat::IEEEdouble;
9293    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
9294                                                   APInt(64, ~(1ULL << 63)))));
9295    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
9296  } else {
9297    const fltSemantics &Sem = APFloat::IEEEsingle;
9298    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
9299                                                   APInt(32, ~(1U << 31)))));
9300    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9301    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9302    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
9303  }
9304  C = ConstantVector::get(CV);
9305  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
9306  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
9307                              MachinePointerInfo::getConstantPool(),
9308                              false, false, false, 16);
9309  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
9310
9311  // Or the value with the sign bit.
9312  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
9313}
9314
9315static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
9316  SDValue N0 = Op.getOperand(0);
9317  SDLoc dl(Op);
9318  MVT VT = Op.getSimpleValueType();
9319
9320  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
9321  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
9322                                  DAG.getConstant(1, VT));
9323  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
9324}
9325
9326// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
9327//
9328static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
9329                                      SelectionDAG &DAG) {
9330  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
9331
9332  if (!Subtarget->hasSSE41())
9333    return SDValue();
9334
9335  if (!Op->hasOneUse())
9336    return SDValue();
9337
9338  SDNode *N = Op.getNode();
9339  SDLoc DL(N);
9340
9341  SmallVector<SDValue, 8> Opnds;
9342  DenseMap<SDValue, unsigned> VecInMap;
9343  EVT VT = MVT::Other;
9344
9345  // Recognize a special case where a vector is cast into a wide integer to
9346  // test whether it is all zeros.
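  // For example, an OR'd tree like
  //   (or (or (extractelt V, 0), (extractelt V, 1)),
  //       (or (extractelt V, 2), (extractelt V, 3)))
  // over all elements of a v4i32 V can be replaced by a single PTEST of V
  // against itself.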
9347  Opnds.push_back(N->getOperand(0));
9348  Opnds.push_back(N->getOperand(1));
9349
9350  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
9351    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
9352    // BFS traverse all OR'd operands.
9353    if (I->getOpcode() == ISD::OR) {
9354      Opnds.push_back(I->getOperand(0));
9355      Opnds.push_back(I->getOperand(1));
9356      // Re-evaluate the number of nodes to be traversed.
9357      e += 2; // 2 more nodes (LHS and RHS) are pushed.
9358      continue;
9359    }
9360
9361    // Quit if this is not an EXTRACT_VECTOR_ELT.
9362    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9363      return SDValue();
9364
9365    // Quit if the index is not a constant.
9366    SDValue Idx = I->getOperand(1);
9367    if (!isa<ConstantSDNode>(Idx))
9368      return SDValue();
9369
9370    SDValue ExtractedFromVec = I->getOperand(0);
9371    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
9372    if (M == VecInMap.end()) {
9373      VT = ExtractedFromVec.getValueType();
9374      // Quit if not 128/256-bit vector.
9375      if (!VT.is128BitVector() && !VT.is256BitVector())
9376        return SDValue();
9377      // Quit if not the same type.
9378      if (VecInMap.begin() != VecInMap.end() &&
9379          VT != VecInMap.begin()->first.getValueType())
9380        return SDValue();
9381      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
9382    }
9383    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
9384  }
9385
9386  assert((VT.is128BitVector() || VT.is256BitVector()) &&
9387         "Not extracted from 128-/256-bit vector.");
9388
9389  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
9390  SmallVector<SDValue, 8> VecIns;
9391
9392  for (DenseMap<SDValue, unsigned>::const_iterator
9393        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
9394    // Quit if not all elements are used.
9395    if (I->second != FullMask)
9396      return SDValue();
9397    VecIns.push_back(I->first);
9398  }
9399
9400  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
9401
9402  // Cast all vectors into TestVT for PTEST.
9403  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
9404    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
9405
9406  // If more than one full vector is evaluated, OR them together before the PTEST.
9407  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
9408    // Each iteration will OR 2 nodes and append the result until there is only
9409    // 1 node left, i.e. the final OR'd value of all vectors.
9410    SDValue LHS = VecIns[Slot];
9411    SDValue RHS = VecIns[Slot + 1];
9412    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
9413  }
9414
9415  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
9416                     VecIns.back(), VecIns.back());
9417}
9418
9419/// Emit nodes that will be selected as "test Op0,Op0", or something
9420/// equivalent.
9421SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
9422                                    SelectionDAG &DAG) const {
9423  SDLoc dl(Op);
9424
9425  // CF and OF aren't always set the way we want. Determine which
9426  // of these we need.
9427  bool NeedCF = false;
9428  bool NeedOF = false;
9429  switch (X86CC) {
9430  default: break;
9431  case X86::COND_A: case X86::COND_AE:
9432  case X86::COND_B: case X86::COND_BE:
9433    NeedCF = true;
9434    break;
9435  case X86::COND_G: case X86::COND_GE:
9436  case X86::COND_L: case X86::COND_LE:
9437  case X86::COND_O: case X86::COND_NO:
9438    NeedOF = true;
9439    break;
9440  }
9441
9442  // See if we can use the EFLAGS value from the operand instead of
9443  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
9444  // we prove that the arithmetic won't overflow, we can't use OF or CF.
9445  if (Op.getResNo() != 0 || NeedOF || NeedCF)
9446    // Emit a CMP with 0, which is the TEST pattern.
9447    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
9448                       DAG.getConstant(0, Op.getValueType()));
9449
9450  unsigned Opcode = 0;
9451  unsigned NumOperands = 0;
9452
9453  // Truncate operations may prevent the merge of the SETCC instruction
9454  // and the arithmetic instruction before it. Attempt to truncate the operands
9455  // of the arithmetic instruction and use a reduced bit-width instruction.
9456  bool NeedTruncation = false;
9457  SDValue ArithOp = Op;
9458  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
9459    SDValue Arith = Op->getOperand(0);
9460    // Both the trunc and the arithmetic op need to have one user each.
9461    if (Arith->hasOneUse())
9462      switch (Arith.getOpcode()) {
9463        default: break;
9464        case ISD::ADD:
9465        case ISD::SUB:
9466        case ISD::AND:
9467        case ISD::OR:
9468        case ISD::XOR: {
9469          NeedTruncation = true;
9470          ArithOp = Arith;
9471        }
9472      }
9473  }
9474
9475  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
9476  // which may have been reached by looking through a truncate.  We still use
9477  // the variable 'Op', the non-truncated value, when we check for possible users.
9478  switch (ArithOp.getOpcode()) {
9479  case ISD::ADD:
9480    // Due to an isel shortcoming, be conservative if this add is likely to be
9481    // selected as part of a load-modify-store instruction. When the root node
9482    // in a match is a store, isel doesn't know how to remap non-chain non-flag
9483    // uses of other nodes in the match, such as the ADD in this case. This
9484    // leads to the ADD being left around and reselected, with the result being
9485    // two adds in the output.  Alas, even if none of our users are stores, that
9486    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
9487    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
9488    // climbing the DAG back to the root, and it doesn't seem to be worth the
9489    // effort.
9490    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9491         UE = Op.getNode()->use_end(); UI != UE; ++UI)
9492      if (UI->getOpcode() != ISD::CopyToReg &&
9493          UI->getOpcode() != ISD::SETCC &&
9494          UI->getOpcode() != ISD::STORE)
9495        goto default_case;
9496
9497    if (ConstantSDNode *C =
9498        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
9499      // An add of one will be selected as an INC.
9500      if (C->getAPIntValue() == 1) {
9501        Opcode = X86ISD::INC;
9502        NumOperands = 1;
9503        break;
9504      }
9505
9506      // An add of negative one (subtract of one) will be selected as a DEC.
9507      if (C->getAPIntValue().isAllOnesValue()) {
9508        Opcode = X86ISD::DEC;
9509        NumOperands = 1;
9510        break;
9511      }
9512    }
9513
9514    // Otherwise use a regular EFLAGS-setting add.
9515    Opcode = X86ISD::ADD;
9516    NumOperands = 2;
9517    break;
9518  case ISD::AND: {
9519    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
9520    // because a TEST instruction will be better.
9521    bool NonFlagUse = false;
9522    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9523           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
9524      SDNode *User = *UI;
9525      unsigned UOpNo = UI.getOperandNo();
9526      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
9527        // Look past the truncate.
9528        UOpNo = User->use_begin().getOperandNo();
9529        User = *User->use_begin();
9530      }
9531
9532      if (User->getOpcode() != ISD::BRCOND &&
9533          User->getOpcode() != ISD::SETCC &&
9534          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
9535        NonFlagUse = true;
9536        break;
9537      }
9538    }
9539
9540    if (!NonFlagUse)
9541      break;
9542  }
9543    // FALL THROUGH
9544  case ISD::SUB:
9545  case ISD::OR:
9546  case ISD::XOR:
9547    // Due to the ISEL shortcoming noted above, be conservative if this op is
9548    // likely to be selected as part of a load-modify-store instruction.
9549    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9550           UE = Op.getNode()->use_end(); UI != UE; ++UI)
9551      if (UI->getOpcode() == ISD::STORE)
9552        goto default_case;
9553
9554    // Otherwise use a regular EFLAGS-setting instruction.
9555    switch (ArithOp.getOpcode()) {
9556    default: llvm_unreachable("unexpected operator!");
9557    case ISD::SUB: Opcode = X86ISD::SUB; break;
9558    case ISD::XOR: Opcode = X86ISD::XOR; break;
9559    case ISD::AND: Opcode = X86ISD::AND; break;
9560    case ISD::OR: {
9561      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
9562        SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
9563        if (EFLAGS.getNode())
9564          return EFLAGS;
9565      }
9566      Opcode = X86ISD::OR;
9567      break;
9568    }
9569    }
9570
9571    NumOperands = 2;
9572    break;
9573  case X86ISD::ADD:
9574  case X86ISD::SUB:
9575  case X86ISD::INC:
9576  case X86ISD::DEC:
9577  case X86ISD::OR:
9578  case X86ISD::XOR:
9579  case X86ISD::AND:
9580    return SDValue(Op.getNode(), 1);
9581  default:
9582  default_case:
9583    break;
9584  }
9585
9586  // If we found that truncation is beneficial, perform the truncation and
9587  // update 'Op'.
9588  if (NeedTruncation) {
9589    EVT VT = Op.getValueType();
9590    SDValue WideVal = Op->getOperand(0);
9591    EVT WideVT = WideVal.getValueType();
9592    unsigned ConvertedOp = 0;
9593    // Use a target machine opcode to prevent further DAGCombine
9594    // optimizations that may separate the arithmetic operations
9595    // from the setcc node.
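    // For example, (i8 (trunc (add i32 %a, %b))) feeding only this test can be
    // rebuilt as (X86ISD::ADD i8 (trunc %a), (trunc %b)), so that its EFLAGS
    // result is consumed in place of a separate TEST.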
9596    switch (WideVal.getOpcode()) {
9597      default: break;
9598      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
9599      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
9600      case ISD::AND: ConvertedOp = X86ISD::AND; break;
9601      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
9602      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
9603    }
9604
9605    if (ConvertedOp) {
9606      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9607      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
9608        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
9609        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
9610        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
9611      }
9612    }
9613  }
9614
9615  if (Opcode == 0)
9616    // Emit a CMP with 0, which is the TEST pattern.
9617    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
9618                       DAG.getConstant(0, Op.getValueType()));
9619
9620  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
9621  SmallVector<SDValue, 4> Ops;
9622  for (unsigned i = 0; i != NumOperands; ++i)
9623    Ops.push_back(Op.getOperand(i));
9624
9625  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
9626  DAG.ReplaceAllUsesWith(Op, New);
9627  return SDValue(New.getNode(), 1);
9628}
9629
9630/// Emit nodes that will be selected as "cmp Op0,Op1", or something
9631/// equivalent.
9632SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
9633                                   SelectionDAG &DAG) const {
9634  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
9635    if (C->getAPIntValue() == 0)
9636      return EmitTest(Op0, X86CC, DAG);
9637
9638  SDLoc dl(Op0);
9639  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
9640       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
9641    // Use SUB instead of CMP to enable CSE between SUB and CMP.
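    // Only the second (EFLAGS) result of the SUB is returned here; the
    // subtraction value itself is dead unless another SUB of the same operands exists.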
9642    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
9643    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
9644                              Op0, Op1);
9645    return SDValue(Sub.getNode(), 1);
9646  }
9647  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
9648}
9649
9650/// Convert a comparison if required by the subtarget.
9651SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
9652                                                 SelectionDAG &DAG) const {
9653  // If the subtarget does not support the FUCOMI instruction, floating-point
9654  // comparisons have to be converted.
9655  if (Subtarget->hasCMov() ||
9656      Cmp.getOpcode() != X86ISD::CMP ||
9657      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
9658      !Cmp.getOperand(1).getValueType().isFloatingPoint())
9659    return Cmp;
9660
9661  // The instruction selector will select an FUCOM instruction instead of
9662  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
9663  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
9664  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
9665  SDLoc dl(Cmp);
9666  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
9667  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
9668  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
9669                            DAG.getConstant(8, MVT::i8));
9670  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
9671  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
9672}
9673
9674static bool isAllOnes(SDValue V) {
9675  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
9676  return C && C->isAllOnesValue();
9677}
9678
9679/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
9680/// if it's possible.
9681SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
9682                                     SDLoc dl, SelectionDAG &DAG) const {
9683  SDValue Op0 = And.getOperand(0);
9684  SDValue Op1 = And.getOperand(1);
9685  if (Op0.getOpcode() == ISD::TRUNCATE)
9686    Op0 = Op0.getOperand(0);
9687  if (Op1.getOpcode() == ISD::TRUNCATE)
9688    Op1 = Op1.getOperand(0);
9689
9690  SDValue LHS, RHS;
9691  if (Op1.getOpcode() == ISD::SHL)
9692    std::swap(Op0, Op1);
9693  if (Op0.getOpcode() == ISD::SHL) {
9694    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
9695      if (And00C->getZExtValue() == 1) {
9696        // If we looked past a truncate, check that it's only truncating away
9697        // known zeros.
9698        unsigned BitWidth = Op0.getValueSizeInBits();
9699        unsigned AndBitWidth = And.getValueSizeInBits();
9700        if (BitWidth > AndBitWidth) {
9701          APInt Zeros, Ones;
9702          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
9703          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
9704            return SDValue();
9705        }
9706        LHS = Op1;
9707        RHS = Op0.getOperand(1);
9708      }
9709  } else if (Op1.getOpcode() == ISD::Constant) {
9710    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
9711    uint64_t AndRHSVal = AndRHS->getZExtValue();
9712    SDValue AndLHS = Op0;
9713
9714    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
9715      LHS = AndLHS.getOperand(0);
9716      RHS = AndLHS.getOperand(1);
9717    }
9718
9719    // Use BT if the immediate can't be encoded in a TEST instruction.
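    // For example, (and X, (1ULL << 40)) has a mask that does not fit the 32-bit
    // immediate of a TEST, so it is matched as BT X, 40 instead.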
9720    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
9721      LHS = AndLHS;
9722      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
9723    }
9724  }
9725
9726  if (LHS.getNode()) {
9727    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
9728    // instruction.  Since the shift amount is in-range-or-undefined, we know
9729    // that doing a bittest on the i32 value is ok.  We extend to i32 because
9730    // the encoding for the i16 version is larger than the i32 version.
9731    // Also promote i16 to i32 for performance / code size reason.
9732    if (LHS.getValueType() == MVT::i8 ||
9733        LHS.getValueType() == MVT::i16)
9734      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
9735
9736    // If the operand types disagree, extend the shift amount to match.  Since
9737    // BT ignores high bits (like shifts) we can use anyextend.
9738    if (LHS.getValueType() != RHS.getValueType())
9739      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
9740
9741    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
9742    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
9743    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9744                       DAG.getConstant(Cond, MVT::i8), BT);
9745  }
9746
9747  return SDValue();
9748}
9749
9750/// \brief Turns an ISD::CondCode into a value suitable for SSE floating point
9751/// mask CMPs.
9752static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
9753                              SDValue &Op1) {
9754  unsigned SSECC;
9755  bool Swap = false;
9756
9757  // SSE Condition code mapping:
9758  //  0 - EQ
9759  //  1 - LT
9760  //  2 - LE
9761  //  3 - UNORD
9762  //  4 - NEQ
9763  //  5 - NLT
9764  //  6 - NLE
9765  //  7 - ORD
9766  switch (SetCCOpcode) {
9767  default: llvm_unreachable("Unexpected SETCC condition");
9768  case ISD::SETOEQ:
9769  case ISD::SETEQ:  SSECC = 0; break;
9770  case ISD::SETOGT:
9771  case ISD::SETGT:  Swap = true; // Fallthrough
9772  case ISD::SETLT:
9773  case ISD::SETOLT: SSECC = 1; break;
9774  case ISD::SETOGE:
9775  case ISD::SETGE:  Swap = true; // Fallthrough
9776  case ISD::SETLE:
9777  case ISD::SETOLE: SSECC = 2; break;
9778  case ISD::SETUO:  SSECC = 3; break;
9779  case ISD::SETUNE:
9780  case ISD::SETNE:  SSECC = 4; break;
9781  case ISD::SETULE: Swap = true; // Fallthrough
9782  case ISD::SETUGE: SSECC = 5; break;
9783  case ISD::SETULT: Swap = true; // Fallthrough
9784  case ISD::SETUGT: SSECC = 6; break;
9785  case ISD::SETO:   SSECC = 7; break;
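  // SETUEQ and SETONE have no single SSE predicate; 8 is a sentinel that tells
  // callers to emit two comparisons and combine the results.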
9786  case ISD::SETUEQ:
9787  case ISD::SETONE: SSECC = 8; break;
9788  }
9789  if (Swap)
9790    std::swap(Op0, Op1);
9791
9792  return SSECC;
9793}
9794
9795// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
9796// ones, and then concatenate the result back.
9797static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
9798  MVT VT = Op.getSimpleValueType();
9799
9800  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
9801         "Unsupported value type for operation");
9802
9803  unsigned NumElems = VT.getVectorNumElements();
9804  SDLoc dl(Op);
9805  SDValue CC = Op.getOperand(2);
9806
9807  // Extract the LHS vectors
9808  SDValue LHS = Op.getOperand(0);
9809  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
9810  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
9811
9812  // Extract the RHS vectors
9813  SDValue RHS = Op.getOperand(1);
9814  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
9815  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
9816
9817  // Issue the operation on the smaller types and concatenate the result back
9818  MVT EltVT = VT.getVectorElementType();
9819  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
9820  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
9821                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
9822                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
9823}
9824
9825static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
9826  SDValue Op0 = Op.getOperand(0);
9827  SDValue Op1 = Op.getOperand(1);
9828  SDValue CC = Op.getOperand(2);
9829  MVT VT = Op.getSimpleValueType();
9830
9831  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
9832         Op.getValueType().getScalarType() == MVT::i1 &&
9833         "Cannot set masked compare for this operation");
9834
9835  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
9836  SDLoc dl(Op);
9837
9838  bool Unsigned = false;
9839  unsigned SSECC;
9840  switch (SetCCOpcode) {
9841  default: llvm_unreachable("Unexpected SETCC condition");
9842  case ISD::SETNE:  SSECC = 4; break;
9843  case ISD::SETEQ:  SSECC = 0; break;
9844  case ISD::SETUGT: Unsigned = true;
9845  case ISD::SETGT:  SSECC = 6; break; // NLE
9846  case ISD::SETULT: Unsigned = true;
9847  case ISD::SETLT:  SSECC = 1; break;
9848  case ISD::SETUGE: Unsigned = true;
9849  case ISD::SETGE:  SSECC = 5; break; // NLT
9850  case ISD::SETULE: Unsigned = true;
9851  case ISD::SETLE:  SSECC = 2; break;
9852  }
9853  unsigned  Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
9854  return DAG.getNode(Opc, dl, VT, Op0, Op1,
9855                     DAG.getConstant(SSECC, MVT::i8));
9856
9857}
9858
9859static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
9860                           SelectionDAG &DAG) {
9861  SDValue Op0 = Op.getOperand(0);
9862  SDValue Op1 = Op.getOperand(1);
9863  SDValue CC = Op.getOperand(2);
9864  MVT VT = Op.getSimpleValueType();
9865  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
9866  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
9867  SDLoc dl(Op);
9868
9869  if (isFP) {
9870#ifndef NDEBUG
9871    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
9872    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
9873#endif
9874
9875    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
9876    unsigned Opc = X86ISD::CMPP;
9877    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
9878      assert(VT.getVectorNumElements() <= 16);
9879      Opc = X86ISD::CMPM;
9880    }
9881    // In the two special cases we can't handle, emit two comparisons.
9882    if (SSECC == 8) {
9883      unsigned CC0, CC1;
9884      unsigned CombineOpc;
9885      if (SetCCOpcode == ISD::SETUEQ) {
9886        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
9887      } else {
9888        assert(SetCCOpcode == ISD::SETONE);
9889        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
9890      }
9891
9892      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
9893                                 DAG.getConstant(CC0, MVT::i8));
9894      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
9895                                 DAG.getConstant(CC1, MVT::i8));
9896      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
9897    }
9898    // Handle all other FP comparisons here.
9899    return DAG.getNode(Opc, dl, VT, Op0, Op1,
9900                       DAG.getConstant(SSECC, MVT::i8));
9901  }
9902
9903  // Break 256-bit integer vector compare into smaller ones.
9904  if (VT.is256BitVector() && !Subtarget->hasInt256())
9905    return Lower256IntVSETCC(Op, DAG);
9906
9907  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
9908  EVT OpVT = Op1.getValueType();
9909  if (Subtarget->hasAVX512()) {
9910    if (Op1.getValueType().is512BitVector() ||
9911        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
9912      return LowerIntVSETCC_AVX512(Op, DAG);
9913
9914    // In the AVX-512 architecture setcc returns a mask with i1 elements,
9915    // but there is no compare instruction for i8 and i16 elements.
9916    // We are not dealing with 512-bit operands in this case, since those
9917    // types are illegal.
9918    if (MaskResult &&
9919        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
9920         OpVT.getVectorElementType().getSizeInBits() >= 8))
9921      return DAG.getNode(ISD::TRUNCATE, dl, VT,
9922                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
9923  }
9924
9925  // We are handling one of the integer comparisons here.  Since SSE only has
9926  // GT and EQ comparisons for integers, swapping operands and multiple
9927  // operations may be required for some comparisons.
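  // For example, x <=u y is emitted below as NOT (PCMPGT (x ^ SignBit), (y ^ SignBit))
  // unless an unsigned min/max instruction can be used instead.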
9928  unsigned Opc;
9929  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
9930
9931  switch (SetCCOpcode) {
9932  default: llvm_unreachable("Unexpected SETCC condition");
9933  case ISD::SETNE:  Invert = true;
9934  case ISD::SETEQ:  Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break;
9935  case ISD::SETLT:  Swap = true;
9936  case ISD::SETGT:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break;
9937  case ISD::SETGE:  Swap = true;
9938  case ISD::SETLE:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
9939                    Invert = true; break;
9940  case ISD::SETULT: Swap = true;
9941  case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
9942                    FlipSigns = true; break;
9943  case ISD::SETUGE: Swap = true;
9944  case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
9945                    FlipSigns = true; Invert = true; break;
9946  }
9947
9948  // Special case: Use min/max operations for SETULE/SETUGE
9949  MVT VET = VT.getVectorElementType();
9950  bool hasMinMax =
9951       (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
9952    || (Subtarget->hasSSE2()  && (VET == MVT::i8));
9953
9954  if (hasMinMax) {
9955    switch (SetCCOpcode) {
9956    default: break;
9957    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
9958    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
9959    }
9960
9961    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
9962  }
9963
9964  if (Swap)
9965    std::swap(Op0, Op1);
9966
9967  // Check that the operation in question is available (most are plain SSE2,
9968  // but PCMPGTQ and PCMPEQQ have different requirements).
9969  if (VT == MVT::v2i64) {
9970    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
9971      assert(Subtarget->hasSSE2() && "Don't know how to lower!");
9972
9973      // First cast everything to the right type.
9974      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
9975      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
9976
9977      // Since SSE has no unsigned integer comparisons, we need to flip the sign
9978      // bits of the inputs before performing those operations. The lower
9979      // compare is always unsigned.
9980      SDValue SB;
9981      if (FlipSigns) {
9982        SB = DAG.getConstant(0x80000000U, MVT::v4i32);
9983      } else {
9984        SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
9985        SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
9986        SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
9987                         Sign, Zero, Sign, Zero);
9988      }
9989      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
9990      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
9991
9992      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
9993      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
9994      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
9995
9996      // Create masks that select only the low parts/high parts of the 64-bit integers.
9997      static const int MaskHi[] = { 1, 1, 3, 3 };
9998      static const int MaskLo[] = { 0, 0, 2, 2 };
9999      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
10000      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
10001      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
10002
10003      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
10004      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
10005
10006      if (Invert)
10007        Result = DAG.getNOT(dl, Result, MVT::v4i32);
10008
10009      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
10010    }
10011
10012    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
10013      // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
10014      // pcmpeqd + pshufd + pand.
10015      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
10016
10017      // First cast everything to the right type.
10018      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
10019      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
10020
10021      // Do the compare.
10022      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
10023
10024      // Make sure the lower and upper halves are both all-ones.
10025      static const int Mask[] = { 1, 0, 3, 2 };
10026      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
10027      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
10028
10029      if (Invert)
10030        Result = DAG.getNOT(dl, Result, MVT::v4i32);
10031
10032      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
10033    }
10034  }
10035
10036  // Since SSE has no unsigned integer comparisons, we need to flip the sign
10037  // bits of the inputs before performing those operations.
10038  if (FlipSigns) {
10039    EVT EltVT = VT.getVectorElementType();
10040    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
10041    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
10042    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
10043  }
10044
10045  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
10046
10047  // If the logical-not of the result is required, perform that now.
10048  if (Invert)
10049    Result = DAG.getNOT(dl, Result, VT);
10050
10051  if (MinMax)
10052    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
10053
10054  return Result;
10055}
10056
10057SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
10058
10059  MVT VT = Op.getSimpleValueType();
10060
10061  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
10062
10063  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
10064  SDValue Op0 = Op.getOperand(0);
10065  SDValue Op1 = Op.getOperand(1);
10066  SDLoc dl(Op);
10067  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
10068
10069  // Optimize to BT if possible.
10070  // Lower (X & (1 << N)) == 0 to BT(X, N).
10071  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
10072  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
10073  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
10074      Op1.getOpcode() == ISD::Constant &&
10075      cast<ConstantSDNode>(Op1)->isNullValue() &&
10076      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10077    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
10078    if (NewSetCC.getNode())
10079      return NewSetCC;
10080  }
10081
10082  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
10083  // these.
10084  if (Op1.getOpcode() == ISD::Constant &&
10085      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
10086       cast<ConstantSDNode>(Op1)->isNullValue()) &&
10087      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10088
10089    // If the input is a setcc, then reuse the input setcc or use a new one with
10090    // the inverted condition.
10091    if (Op0.getOpcode() == X86ISD::SETCC) {
10092      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
10093      bool Invert = (CC == ISD::SETNE) ^
10094        cast<ConstantSDNode>(Op1)->isNullValue();
10095      if (!Invert) return Op0;
10096
10097      CCode = X86::GetOppositeBranchCondition(CCode);
10098      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
10099                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
10100    }
10101  }
10102
10103  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
10104  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
10105  if (X86CC == X86::COND_INVALID)
10106    return SDValue();
10107
10108  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
10109  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
10110  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
10111                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
10112}
10113
10114// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
10115static bool isX86LogicalCmp(SDValue Op) {
10116  unsigned Opc = Op.getNode()->getOpcode();
10117  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
10118      Opc == X86ISD::SAHF)
10119    return true;
10120  if (Op.getResNo() == 1 &&
10121      (Opc == X86ISD::ADD ||
10122       Opc == X86ISD::SUB ||
10123       Opc == X86ISD::ADC ||
10124       Opc == X86ISD::SBB ||
10125       Opc == X86ISD::SMUL ||
10126       Opc == X86ISD::UMUL ||
10127       Opc == X86ISD::INC ||
10128       Opc == X86ISD::DEC ||
10129       Opc == X86ISD::OR ||
10130       Opc == X86ISD::XOR ||
10131       Opc == X86ISD::AND))
10132    return true;
10133
10134  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
10135    return true;
10136
10137  return false;
10138}
10139
10140static bool isZero(SDValue V) {
10141  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
10142  return C && C->isNullValue();
10143}
10144
10145static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
10146  if (V.getOpcode() != ISD::TRUNCATE)
10147    return false;
10148
10149  SDValue VOp0 = V.getOperand(0);
10150  unsigned InBits = VOp0.getValueSizeInBits();
10151  unsigned Bits = V.getValueSizeInBits();
10152  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
10153}
10154
10155SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10156  bool addTest = true;
10157  SDValue Cond  = Op.getOperand(0);
10158  SDValue Op1 = Op.getOperand(1);
10159  SDValue Op2 = Op.getOperand(2);
10160  SDLoc DL(Op);
10161  EVT VT = Op1.getValueType();
10162  SDValue CC;
10163
10164  // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
10165  // are available. Otherwise fp cmovs get lowered into a less efficient branch
10166  // sequence later on.
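  // In that case the select becomes (Cmp & Op1) | (~Cmp & Op2), where Cmp is the
  // all-ones or all-zeros result of the scalar FSETCC.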
10167  if (Cond.getOpcode() == ISD::SETCC &&
10168      ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
10169       (Subtarget->hasSSE1() && VT == MVT::f32)) &&
10170      VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
10171    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
10172    int SSECC = translateX86FSETCC(
10173        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
10174
10175    if (SSECC != 8) {
10176      unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd;
10177      SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1,
10178                                DAG.getConstant(SSECC, MVT::i8));
10179      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
10180      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
10181      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
10182    }
10183  }
10184
10185  if (Cond.getOpcode() == ISD::SETCC) {
10186    SDValue NewCond = LowerSETCC(Cond, DAG);
10187    if (NewCond.getNode())
10188      Cond = NewCond;
10189  }
10190
10191  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
10192  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
10193  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
10194  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
10195  if (Cond.getOpcode() == X86ISD::SETCC &&
10196      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
10197      isZero(Cond.getOperand(1).getOperand(1))) {
10198    SDValue Cmp = Cond.getOperand(1);
10199
10200    unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
10201
10202    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
10203        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
10204      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
10205
10206      SDValue CmpOp0 = Cmp.getOperand(0);
10207      // Apply further optimizations for special cases
10208      // (select (x != 0), -1, 0) -> neg & sbb
10209      // (select (x == 0), 0, -1) -> neg & sbb
10210      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
10211        if (YC->isNullValue() &&
10212            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
10213          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
10214          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
10215                                    DAG.getConstant(0, CmpOp0.getValueType()),
10216                                    CmpOp0);
10217          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
10218                                    DAG.getConstant(X86::COND_B, MVT::i8),
10219                                    SDValue(Neg.getNode(), 1));
10220          return Res;
10221        }
10222
10223      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
10224                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
10225      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
10226
10227      SDValue Res =   // Res = 0 or -1.
10228        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
10229                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
10230
10231      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
10232        Res = DAG.getNOT(DL, Res, Res.getValueType());
10233
10234      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
10235      if (N2C == 0 || !N2C->isNullValue())
10236        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
10237      return Res;
10238    }
10239  }
10240
10241  // Look past (and (setcc_carry (cmp ...)), 1).
10242  if (Cond.getOpcode() == ISD::AND &&
10243      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
10244    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
10245    if (C && C->getAPIntValue() == 1)
10246      Cond = Cond.getOperand(0);
10247  }
10248
10249  // If condition flag is set by a X86ISD::CMP, then use it as the condition
10250  // setting operand in place of the X86ISD::SETCC.
10251  unsigned CondOpcode = Cond.getOpcode();
10252  if (CondOpcode == X86ISD::SETCC ||
10253      CondOpcode == X86ISD::SETCC_CARRY) {
10254    CC = Cond.getOperand(0);
10255
10256    SDValue Cmp = Cond.getOperand(1);
10257    unsigned Opc = Cmp.getOpcode();
10258    MVT VT = Op.getSimpleValueType();
10259
10260    bool IllegalFPCMov = false;
10261    if (VT.isFloatingPoint() && !VT.isVector() &&
10262        !isScalarFPTypeInSSEReg(VT))  // FPStack?
10263      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
10264
10265    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
10266        Opc == X86ISD::BT) { // FIXME
10267      Cond = Cmp;
10268      addTest = false;
10269    }
10270  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
10271             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
10272             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
10273              Cond.getOperand(0).getValueType() != MVT::i8)) {
10274    SDValue LHS = Cond.getOperand(0);
10275    SDValue RHS = Cond.getOperand(1);
10276    unsigned X86Opcode;
10277    unsigned X86Cond;
10278    SDVTList VTs;
10279    switch (CondOpcode) {
10280    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
10281    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
10282    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
10283    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
10284    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
10285    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
10286    default: llvm_unreachable("unexpected overflowing operator");
10287    }
10288    if (CondOpcode == ISD::UMULO)
10289      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
10290                          MVT::i32);
10291    else
10292      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
10293
10294    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
10295
10296    if (CondOpcode == ISD::UMULO)
10297      Cond = X86Op.getValue(2);
10298    else
10299      Cond = X86Op.getValue(1);
10300
10301    CC = DAG.getConstant(X86Cond, MVT::i8);
10302    addTest = false;
10303  }
10304
10305  if (addTest) {
10306    // Look past the truncate if the high bits are known zero.
10307    if (isTruncWithZeroHighBitsInput(Cond, DAG))
10308        Cond = Cond.getOperand(0);
10309
10310    // We know the result of AND is compared against zero. Try to match
10311    // it to BT.
10312    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
10313      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
10314      if (NewSetCC.getNode()) {
10315        CC = NewSetCC.getOperand(0);
10316        Cond = NewSetCC.getOperand(1);
10317        addTest = false;
10318      }
10319    }
10320  }
10321
10322  if (addTest) {
10323    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10324    Cond = EmitTest(Cond, X86::COND_NE, DAG);
10325  }
10326
10327  // a <  b ? -1 :  0 -> RES = ~setcc_carry
10328  // a <  b ?  0 : -1 -> RES = setcc_carry
10329  // a >= b ? -1 :  0 -> RES = setcc_carry
10330  // a >= b ?  0 : -1 -> RES = ~setcc_carry
10331  if (Cond.getOpcode() == X86ISD::SUB) {
10332    Cond = ConvertCmpIfNecessary(Cond, DAG);
10333    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
10334
10335    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
10336        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
10337      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
10338                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
10339      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
10340        return DAG.getNOT(DL, Res, Res.getValueType());
10341      return Res;
10342    }
10343  }
10344
10345  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
10346  // widen the cmov and push the truncate through. This avoids introducing a new
10347  // branch during isel and doesn't add any extensions.
10348  if (Op.getValueType() == MVT::i8 &&
10349      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
10350    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
10351    if (T1.getValueType() == T2.getValueType() &&
10352        // Blacklist CopyFromReg to avoid partial register stalls.
10353        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
10354      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
10355      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
10356      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
10357    }
10358  }
10359
10360  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
10361  // condition is true.
10362  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
10363  SDValue Ops[] = { Op2, Op1, CC, Cond };
10364  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
10365}
10366
10367static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
10368  MVT VT = Op->getSimpleValueType(0);
10369  SDValue In = Op->getOperand(0);
10370  MVT InVT = In.getSimpleValueType();
10371  SDLoc dl(Op);
10372
10373  unsigned int NumElts = VT.getVectorNumElements();
10374  if (NumElts != 8 && NumElts != 16)
10375    return SDValue();
10376
10377  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
10378    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
10379
10380  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10381  assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
10382
10383  MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
10384  Constant *C = ConstantInt::get(*DAG.getContext(),
10385    APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
10386
10387  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
10388  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
10389  SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
10390                          MachinePointerInfo::getConstantPool(),
10391                          false, false, false, Alignment);
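  // Broadcast the all-ones scalar into the lanes selected by the i1 mask; the
  // remaining lanes are zero, which yields the sign-extended mask value.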
10392  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
10393  if (VT.is512BitVector())
10394    return Brcst;
10395  return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
10396}
10397
10398static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
10399                                SelectionDAG &DAG) {
10400  MVT VT = Op->getSimpleValueType(0);
10401  SDValue In = Op->getOperand(0);
10402  MVT InVT = In.getSimpleValueType();
10403  SDLoc dl(Op);
10404
10405  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
10406    return LowerSIGN_EXTEND_AVX512(Op, DAG);
10407
10408  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
10409      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
10410      (VT != MVT::v16i16 || InVT != MVT::v16i8))
10411    return SDValue();
10412
10413  if (Subtarget->hasInt256())
10414    return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In);
10415
10416  // Optimize vectors in AVX mode:
10417  // Sign extend  v8i16 to v8i32 and
10418  //              v4i32 to v4i64.
10419  //
10420  // Divide the input vector into two parts;
10421  // for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }.
10422  // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
10423  // then concatenate the vectors back to the original VT.
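  // For example, for v8i16 -> v8i32 the two shuffles produce
  //   OpLo = <In[0..3], undef, undef, undef, undef> and
  //   OpHi = <In[4..7], undef, undef, undef, undef>,
  // each of which is sign extended to v4i32 and the results are concatenated.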
10424
10425  unsigned NumElems = InVT.getVectorNumElements();
10426  SDValue Undef = DAG.getUNDEF(InVT);
10427
10428  SmallVector<int,8> ShufMask1(NumElems, -1);
10429  for (unsigned i = 0; i != NumElems/2; ++i)
10430    ShufMask1[i] = i;
10431
10432  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
10433
10434  SmallVector<int,8> ShufMask2(NumElems, -1);
10435  for (unsigned i = 0; i != NumElems/2; ++i)
10436    ShufMask2[i] = i + NumElems/2;
10437
10438  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
10439
10440  MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
10441                                VT.getVectorNumElements()/2);
10442
10443  OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
10444  OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
10445
10446  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
10447}
10448
10449// isAndOrOfSetCCs - Return true if the node is an ISD::AND or ISD::OR of
10450// two X86ISD::SETCC nodes, each of which has no other use apart from the
10451// AND / OR.
10452static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
10453  Opc = Op.getOpcode();
10454  if (Opc != ISD::OR && Opc != ISD::AND)
10455    return false;
10456  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
10457          Op.getOperand(0).hasOneUse() &&
10458          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
10459          Op.getOperand(1).hasOneUse());
10460}
10461
10462// isXor1OfSetCC - Return true if the node is an ISD::XOR of an X86ISD::SETCC
10463// and 1, and the SETCC node has a single use.
10464static bool isXor1OfSetCC(SDValue Op) {
10465  if (Op.getOpcode() != ISD::XOR)
10466    return false;
10467  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
10468  if (N1C && N1C->getAPIntValue() == 1) {
10469    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
10470      Op.getOperand(0).hasOneUse();
10471  }
10472  return false;
10473}
10474
10475SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
10476  bool addTest = true;
10477  SDValue Chain = Op.getOperand(0);
10478  SDValue Cond  = Op.getOperand(1);
10479  SDValue Dest  = Op.getOperand(2);
10480  SDLoc dl(Op);
10481  SDValue CC;
10482  bool Inverted = false;
10483
10484  if (Cond.getOpcode() == ISD::SETCC) {
10485    // Check for setcc([su]{add,sub,mul}o == 0).
10486    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
10487        isa<ConstantSDNode>(Cond.getOperand(1)) &&
10488        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
10489        Cond.getOperand(0).getResNo() == 1 &&
10490        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
10491         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
10492         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
10493         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
10494         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
10495         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
10496      Inverted = true;
10497      Cond = Cond.getOperand(0);
10498    } else {
10499      SDValue NewCond = LowerSETCC(Cond, DAG);
10500      if (NewCond.getNode())
10501        Cond = NewCond;
10502    }
10503  }
10504#if 0
10505  // FIXME: LowerXALUO doesn't handle these!!
10506  else if (Cond.getOpcode() == X86ISD::ADD  ||
10507           Cond.getOpcode() == X86ISD::SUB  ||
10508           Cond.getOpcode() == X86ISD::SMUL ||
10509           Cond.getOpcode() == X86ISD::UMUL)
10510    Cond = LowerXALUO(Cond, DAG);
10511#endif
10512
10513  // Look past (and (setcc_carry (cmp ...)), 1).
10514  if (Cond.getOpcode() == ISD::AND &&
10515      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
10516    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
10517    if (C && C->getAPIntValue() == 1)
10518      Cond = Cond.getOperand(0);
10519  }
10520
10521  // If condition flag is set by a X86ISD::CMP, then use it as the condition
10522  // setting operand in place of the X86ISD::SETCC.
10523  unsigned CondOpcode = Cond.getOpcode();
10524  if (CondOpcode == X86ISD::SETCC ||
10525      CondOpcode == X86ISD::SETCC_CARRY) {
10526    CC = Cond.getOperand(0);
10527
10528    SDValue Cmp = Cond.getOperand(1);
10529    unsigned Opc = Cmp.getOpcode();
10530    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
10531    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
10532      Cond = Cmp;
10533      addTest = false;
10534    } else {
10535      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
10536      default: break;
10537      case X86::COND_O:
10538      case X86::COND_B:
10539        // These can only come from an arithmetic instruction with overflow,
10540        // e.g. SADDO, UADDO.
10541        Cond = Cond.getNode()->getOperand(1);
10542        addTest = false;
10543        break;
10544      }
10545    }
10546  }
10547  CondOpcode = Cond.getOpcode();
10548  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
10549      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
10550      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
10551       Cond.getOperand(0).getValueType() != MVT::i8)) {
10552    SDValue LHS = Cond.getOperand(0);
10553    SDValue RHS = Cond.getOperand(1);
10554    unsigned X86Opcode;
10555    unsigned X86Cond;
10556    SDVTList VTs;
10557    switch (CondOpcode) {
10558    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
10559    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
10560    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
10561    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
10562    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
10563    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
10564    default: llvm_unreachable("unexpected overflowing operator");
10565    }
10566    if (Inverted)
10567      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
10568    if (CondOpcode == ISD::UMULO)
10569      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
10570                          MVT::i32);
10571    else
10572      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
10573
10574    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
10575
10576    if (CondOpcode == ISD::UMULO)
10577      Cond = X86Op.getValue(2);
10578    else
10579      Cond = X86Op.getValue(1);
10580
10581    CC = DAG.getConstant(X86Cond, MVT::i8);
10582    addTest = false;
10583  } else {
10584    unsigned CondOpc;
10585    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
10586      SDValue Cmp = Cond.getOperand(0).getOperand(1);
10587      if (CondOpc == ISD::OR) {
10588        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
10589        // two branches instead of an explicit OR instruction with a
10590        // separate test.
10591        if (Cmp == Cond.getOperand(1).getOperand(1) &&
10592            isX86LogicalCmp(Cmp)) {
10593          CC = Cond.getOperand(0).getOperand(0);
10594          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10595                              Chain, Dest, CC, Cmp);
10596          CC = Cond.getOperand(1).getOperand(0);
10597          Cond = Cmp;
10598          addTest = false;
10599        }
10600      } else { // ISD::AND
10601        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
10602        // two branches instead of an explicit AND instruction with a
10603        // separate test. However, we only do this if this block doesn't
10604        // have a fall-through edge, because this requires an explicit
10605        // jmp when the condition is false.
10606        if (Cmp == Cond.getOperand(1).getOperand(1) &&
10607            isX86LogicalCmp(Cmp) &&
10608            Op.getNode()->hasOneUse()) {
10609          X86::CondCode CCode =
10610            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
10611          CCode = X86::GetOppositeBranchCondition(CCode);
10612          CC = DAG.getConstant(CCode, MVT::i8);
10613          SDNode *User = *Op.getNode()->use_begin();
10614          // Look for an unconditional branch following this conditional branch.
10615          // We need this because we need to reverse the successors in order
10616          // to implement FCMP_OEQ.
10617          if (User->getOpcode() == ISD::BR) {
10618            SDValue FalseBB = User->getOperand(1);
10619            SDNode *NewBR =
10620              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
10621            assert(NewBR == User);
10622            (void)NewBR;
10623            Dest = FalseBB;
10624
10625            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10626                                Chain, Dest, CC, Cmp);
10627            X86::CondCode CCode =
10628              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
10629            CCode = X86::GetOppositeBranchCondition(CCode);
10630            CC = DAG.getConstant(CCode, MVT::i8);
10631            Cond = Cmp;
10632            addTest = false;
10633          }
10634        }
10635      }
10636    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
10637      // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
10638      // It should be transformed by the dag combiner except when the condition
10639      // is set by an arithmetic-with-overflow node.
10640      X86::CondCode CCode =
10641        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
10642      CCode = X86::GetOppositeBranchCondition(CCode);
10643      CC = DAG.getConstant(CCode, MVT::i8);
10644      Cond = Cond.getOperand(0).getOperand(1);
10645      addTest = false;
10646    } else if (Cond.getOpcode() == ISD::SETCC &&
10647               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
10648      // For FCMP_OEQ, we can emit
10649      // two branches instead of an explicit AND instruction with a
10650      // separate test. However, we only do this if this block doesn't
10651      // have a fall-through edge, because this requires an explicit
10652      // jmp when the condition is false.
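      // The emitted sequence is effectively:
      //   jne  false_bb
      //   jp   false_bb
      //   jmp  original_dest   (the following unconditional branch is retargeted)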
10653      if (Op.getNode()->hasOneUse()) {
10654        SDNode *User = *Op.getNode()->use_begin();
10655        // Look for an unconditional branch following this conditional branch.
10656        // We need this because we need to reverse the successors in order
10657        // to implement FCMP_OEQ.
10658        if (User->getOpcode() == ISD::BR) {
10659          SDValue FalseBB = User->getOperand(1);
10660          SDNode *NewBR =
10661            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
10662          assert(NewBR == User);
10663          (void)NewBR;
10664          Dest = FalseBB;
10665
10666          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
10667                                    Cond.getOperand(0), Cond.getOperand(1));
10668          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
10669          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10670          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10671                              Chain, Dest, CC, Cmp);
10672          CC = DAG.getConstant(X86::COND_P, MVT::i8);
10673          Cond = Cmp;
10674          addTest = false;
10675        }
10676      }
10677    } else if (Cond.getOpcode() == ISD::SETCC &&
10678               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
10679      // For FCMP_UNE, we can emit
10680      // two branches instead of an explicit AND instruction with a
10681      // separate test. However, we only do this if this block doesn't
10682      // have a fall-through edge, because this requires an explicit
10683      // jmp when the condition is false.
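      // The emitted sequence is effectively:
      //   jne  true_dest       (not equal -> taken)
      //   jnp  false_bb        (ordered and equal -> false)
      //   jmp  true_dest       (unordered falls through to the retargeted branch)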
10684      if (Op.getNode()->hasOneUse()) {
10685        SDNode *User = *Op.getNode()->use_begin();
10686        // Look for an unconditional branch following this conditional branch.
10687        // We need this because we need to reverse the successors in order
10688        // to implement FCMP_UNE.
10689        if (User->getOpcode() == ISD::BR) {
10690          SDValue FalseBB = User->getOperand(1);
10691          SDNode *NewBR =
10692            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
10693          assert(NewBR == User);
10694          (void)NewBR;
10695
10696          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
10697                                    Cond.getOperand(0), Cond.getOperand(1));
10698          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
10699          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10700          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10701                              Chain, Dest, CC, Cmp);
10702          CC = DAG.getConstant(X86::COND_NP, MVT::i8);
10703          Cond = Cmp;
10704          addTest = false;
10705          Dest = FalseBB;
10706        }
10707      }
10708    }
10709  }
10710
10711  if (addTest) {
10712    // Look pass the truncate if the high bits are known zero.
10713    if (isTruncWithZeroHighBitsInput(Cond, DAG))
10714        Cond = Cond.getOperand(0);
10715
10716    // We know the result of AND is compared against zero. Try to match
10717    // it to BT.
10718    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
10719      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
10720      if (NewSetCC.getNode()) {
10721        CC = NewSetCC.getOperand(0);
10722        Cond = NewSetCC.getOperand(1);
10723        addTest = false;
10724      }
10725    }
10726  }
10727
10728  if (addTest) {
10729    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10730    Cond = EmitTest(Cond, X86::COND_NE, DAG);
10731  }
10732  Cond = ConvertCmpIfNecessary(Cond, DAG);
10733  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10734                     Chain, Dest, CC, Cond);
10735}
10736
10737// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
10738// Calls to _alloca is needed to probe the stack when allocating more than 4k
10739// bytes in one go. Touching the stack at 4K increments is necessary to ensure
10740// that the guard pages used by the OS virtual memory manager are allocated in
10741// correct sequence.
10742SDValue
10743X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
10744                                           SelectionDAG &DAG) const {
10745  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
10746          getTargetMachine().Options.EnableSegmentedStacks) &&
10747         "This should be used only on Windows targets or when segmented stacks "
10748         "are being used");
10749  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
10750  SDLoc dl(Op);
10751
10752  // Get the inputs.
10753  SDValue Chain = Op.getOperand(0);
10754  SDValue Size  = Op.getOperand(1);
10755  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
10756  EVT VT = Op.getNode()->getValueType(0);
10757
10758  bool Is64Bit = Subtarget->is64Bit();
10759  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
10760
10761  if (getTargetMachine().Options.EnableSegmentedStacks) {
10762    MachineFunction &MF = DAG.getMachineFunction();
10763    MachineRegisterInfo &MRI = MF.getRegInfo();
10764
10765    if (Is64Bit) {
10766      // The 64-bit implementation of segmented stacks needs to clobber both r10
10767      // and r11. This makes it impossible to use it along with nested parameters.
10768      const Function *F = MF.getFunction();
10769
10770      for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
10771           I != E; ++I)
10772        if (I->hasNestAttr())
10773          report_fatal_error("Cannot use segmented stacks with functions that "
10774                             "have nested arguments.");
10775    }
10776
10777    const TargetRegisterClass *AddrRegClass =
10778      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
10779    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
10780    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
10781    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
10782                                DAG.getRegister(Vreg, SPTy));
10783    SDValue Ops1[2] = { Value, Chain };
10784    return DAG.getMergeValues(Ops1, 2, dl);
10785  } else {
10786    SDValue Flag;
10787    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
10788
10789    Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
10790    Flag = Chain.getValue(1);
10791    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10792
10793    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
10794
10795    const X86RegisterInfo *RegInfo =
10796      static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
10797    unsigned SPReg = RegInfo->getStackRegister();
10798    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
10799    Chain = SP.getValue(1);
10800
10801    if (Align) {
10802      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
10803                       DAG.getConstant(-(uint64_t)Align, VT));
10804      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
10805    }
10806
10807    SDValue Ops1[2] = { SP, Chain };
10808    return DAG.getMergeValues(Ops1, 2, dl);
10809  }
10810}
10811
10812SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
10813  MachineFunction &MF = DAG.getMachineFunction();
10814  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
10815
10816  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10817  SDLoc DL(Op);
10818
10819  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
10820    // vastart just stores the address of the VarArgsFrameIndex slot into the
10821    // memory location argument.
10822    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
10823                                   getPointerTy());
10824    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10825                        MachinePointerInfo(SV), false, false, 0);
10826  }
10827
10828  // __va_list_tag:
10829  //   gp_offset         (0 .. 6 * 8)
10830  //   fp_offset         (48 .. 48 + 8 * 16)
10831  //   overflow_arg_area (points to parameters passed in memory).
10832  //   reg_save_area
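  // For reference, the AMD64 ABI __va_list_tag is roughly:
  //   struct __va_list_tag {
  //     unsigned gp_offset;         // offset into reg_save_area for the next GPR
  //     unsigned fp_offset;         // offset into reg_save_area for the next XMM
  //     void    *overflow_arg_area; // arguments passed on the stack
  //     void    *reg_save_area;
  //   };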
10833  SmallVector<SDValue, 8> MemOps;
10834  SDValue FIN = Op.getOperand(1);
10835  // Store gp_offset
10836  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
10837                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
10838                                               MVT::i32),
10839                               FIN, MachinePointerInfo(SV), false, false, 0);
10840  MemOps.push_back(Store);
10841
10842  // Store fp_offset
10843  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
10844                    FIN, DAG.getIntPtrConstant(4));
10845  Store = DAG.getStore(Op.getOperand(0), DL,
10846                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
10847                                       MVT::i32),
10848                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
10849  MemOps.push_back(Store);
10850
10851  // Store ptr to overflow_arg_area
10852  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
10853                    FIN, DAG.getIntPtrConstant(4));
10854  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
10855                                    getPointerTy());
10856  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
10857                       MachinePointerInfo(SV, 8),
10858                       false, false, 0);
10859  MemOps.push_back(Store);
10860
10861  // Store ptr to reg_save_area.
10862  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
10863                    FIN, DAG.getIntPtrConstant(8));
10864  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
10865                                    getPointerTy());
10866  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
10867                       MachinePointerInfo(SV, 16), false, false, 0);
10868  MemOps.push_back(Store);
10869  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
10870                     &MemOps[0], MemOps.size());
10871}
10872
10873SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10874  assert(Subtarget->is64Bit() &&
10875         "LowerVAARG only handles 64-bit va_arg!");
10876  assert((Subtarget->isTargetLinux() ||
10877          Subtarget->isTargetDarwin()) &&
10878          "Unhandled target in LowerVAARG");
10879  assert(Op.getNode()->getNumOperands() == 4);
10880  SDValue Chain = Op.getOperand(0);
10881  SDValue SrcPtr = Op.getOperand(1);
10882  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10883  unsigned Align = Op.getConstantOperandVal(3);
10884  SDLoc dl(Op);
10885
10886  EVT ArgVT = Op.getNode()->getValueType(0);
10887  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
10888  uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
10889  uint8_t ArgMode;
10890
10891  // Decide which area this value should be read from.
10892  // TODO: Implement the AMD64 ABI in its entirety. This simple
10893  // selection mechanism works only for the basic types.
10894  if (ArgVT == MVT::f80) {
10895    llvm_unreachable("va_arg for f80 not yet implemented");
10896  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
10897    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
10898  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
10899    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
10900  } else {
10901    llvm_unreachable("Unhandled argument type in LowerVAARG");
10902  }
10903
10904  if (ArgMode == 2) {
10905    // Sanity Check: Make sure using fp_offset makes sense.
10906    assert(!getTargetMachine().Options.UseSoftFloat &&
10907           !(DAG.getMachineFunction()
10908                .getFunction()->getAttributes()
10909                .hasAttribute(AttributeSet::FunctionIndex,
10910                              Attribute::NoImplicitFloat)) &&
10911           Subtarget->hasSSE1());
10912  }
10913
10914  // Insert VAARG_64 node into the DAG
10915  // VAARG_64 returns two values: Variable Argument Address, Chain
10916  SmallVector<SDValue, 11> InstOps;
10917  InstOps.push_back(Chain);
10918  InstOps.push_back(SrcPtr);
10919  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
10920  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
10921  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
10922  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
10923  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
10924                                          VTs, &InstOps[0], InstOps.size(),
10925                                          MVT::i64,
10926                                          MachinePointerInfo(SV),
10927                                          /*Align=*/0,
10928                                          /*Volatile=*/false,
10929                                          /*ReadMem=*/true,
10930                                          /*WriteMem=*/true);
10931  Chain = VAARG.getValue(1);
10932
10933  // Load the next argument and return it
10934  return DAG.getLoad(ArgVT, dl,
10935                     Chain,
10936                     VAARG,
10937                     MachinePointerInfo(),
10938                     false, false, false, 0);
10939}
10940
10941static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
10942                           SelectionDAG &DAG) {
10943  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
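  // That is 24 bytes in total, which is why the memcpy below copies 24 bytes
  // with 8-byte alignment.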
10944  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
10945  SDValue Chain = Op.getOperand(0);
10946  SDValue DstPtr = Op.getOperand(1);
10947  SDValue SrcPtr = Op.getOperand(2);
10948  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10949  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10950  SDLoc DL(Op);
10951
10952  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
10953                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
10954                       false,
10955                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
10956}
10957
10958// getTargetVShiftByConstNode - Handle vector element shifts where the shift
10959// amount is a constant. Takes immediate version of shift as input.
10960static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
10961                                          SDValue SrcOp, uint64_t ShiftAmt,
10962                                          SelectionDAG &DAG) {
10963
10964  // Check for ShiftAmt >= element width
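  // Logical shifts by at least the element width produce zero; arithmetic
  // right shifts are clamped to (width - 1), which simply replicates the
  // sign bit across the element.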
10965  if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) {
10966    if (Opc == X86ISD::VSRAI)
10967      ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1;
10968    else
10969      return DAG.getConstant(0, VT);
10970  }
10971
10972  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
10973         && "Unknown target vector shift-by-constant node");
10974
10975  return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
10976}
10977
10978// getTargetVShiftNode - Handle vector element shifts where the shift amount
10979// may or may not be a constant. Takes immediate version of shift as input.
10980static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
10981                                   SDValue SrcOp, SDValue ShAmt,
10982                                   SelectionDAG &DAG) {
10983  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
10984
10985  // Catch shift-by-constant.
10986  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
10987    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
10988                                      CShAmt->getZExtValue(), DAG);
10989
10990  // Change opcode to non-immediate version
10991  switch (Opc) {
10992    default: llvm_unreachable("Unknown target vector shift node");
10993    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
10994    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
10995    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
10996  }
10997
10998  // Need to build a vector containing the shift amount.
10999  // The shift amount is 32 bits, but SSE instructions read 64 bits, so fill with 0.
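  // For example, a 32-bit shift amount S is materialized as <S, 0, undef, undef>
  // and then bitcast to a 128-bit vector with the operand's element type.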
11000  SDValue ShOps[4];
11001  ShOps[0] = ShAmt;
11002  ShOps[1] = DAG.getConstant(0, MVT::i32);
11003  ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
11004  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
11005
11006  // The return type has to be a 128-bit type with the same element
11007  // type as the input type.
11008  MVT EltVT = VT.getVectorElementType().getSimpleVT();
11009  EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
11010
11011  ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
11012  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
11013}
11014
11015static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
11016  SDLoc dl(Op);
11017  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
11018  switch (IntNo) {
11019  default: return SDValue();    // Don't custom lower most intrinsics.
11020  // Comparison intrinsics.
11021  case Intrinsic::x86_sse_comieq_ss:
11022  case Intrinsic::x86_sse_comilt_ss:
11023  case Intrinsic::x86_sse_comile_ss:
11024  case Intrinsic::x86_sse_comigt_ss:
11025  case Intrinsic::x86_sse_comige_ss:
11026  case Intrinsic::x86_sse_comineq_ss:
11027  case Intrinsic::x86_sse_ucomieq_ss:
11028  case Intrinsic::x86_sse_ucomilt_ss:
11029  case Intrinsic::x86_sse_ucomile_ss:
11030  case Intrinsic::x86_sse_ucomigt_ss:
11031  case Intrinsic::x86_sse_ucomige_ss:
11032  case Intrinsic::x86_sse_ucomineq_ss:
11033  case Intrinsic::x86_sse2_comieq_sd:
11034  case Intrinsic::x86_sse2_comilt_sd:
11035  case Intrinsic::x86_sse2_comile_sd:
11036  case Intrinsic::x86_sse2_comigt_sd:
11037  case Intrinsic::x86_sse2_comige_sd:
11038  case Intrinsic::x86_sse2_comineq_sd:
11039  case Intrinsic::x86_sse2_ucomieq_sd:
11040  case Intrinsic::x86_sse2_ucomilt_sd:
11041  case Intrinsic::x86_sse2_ucomile_sd:
11042  case Intrinsic::x86_sse2_ucomigt_sd:
11043  case Intrinsic::x86_sse2_ucomige_sd:
11044  case Intrinsic::x86_sse2_ucomineq_sd: {
11045    unsigned Opc;
11046    ISD::CondCode CC;
11047    switch (IntNo) {
11048    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
11049    case Intrinsic::x86_sse_comieq_ss:
11050    case Intrinsic::x86_sse2_comieq_sd:
11051      Opc = X86ISD::COMI;
11052      CC = ISD::SETEQ;
11053      break;
11054    case Intrinsic::x86_sse_comilt_ss:
11055    case Intrinsic::x86_sse2_comilt_sd:
11056      Opc = X86ISD::COMI;
11057      CC = ISD::SETLT;
11058      break;
11059    case Intrinsic::x86_sse_comile_ss:
11060    case Intrinsic::x86_sse2_comile_sd:
11061      Opc = X86ISD::COMI;
11062      CC = ISD::SETLE;
11063      break;
11064    case Intrinsic::x86_sse_comigt_ss:
11065    case Intrinsic::x86_sse2_comigt_sd:
11066      Opc = X86ISD::COMI;
11067      CC = ISD::SETGT;
11068      break;
11069    case Intrinsic::x86_sse_comige_ss:
11070    case Intrinsic::x86_sse2_comige_sd:
11071      Opc = X86ISD::COMI;
11072      CC = ISD::SETGE;
11073      break;
11074    case Intrinsic::x86_sse_comineq_ss:
11075    case Intrinsic::x86_sse2_comineq_sd:
11076      Opc = X86ISD::COMI;
11077      CC = ISD::SETNE;
11078      break;
11079    case Intrinsic::x86_sse_ucomieq_ss:
11080    case Intrinsic::x86_sse2_ucomieq_sd:
11081      Opc = X86ISD::UCOMI;
11082      CC = ISD::SETEQ;
11083      break;
11084    case Intrinsic::x86_sse_ucomilt_ss:
11085    case Intrinsic::x86_sse2_ucomilt_sd:
11086      Opc = X86ISD::UCOMI;
11087      CC = ISD::SETLT;
11088      break;
11089    case Intrinsic::x86_sse_ucomile_ss:
11090    case Intrinsic::x86_sse2_ucomile_sd:
11091      Opc = X86ISD::UCOMI;
11092      CC = ISD::SETLE;
11093      break;
11094    case Intrinsic::x86_sse_ucomigt_ss:
11095    case Intrinsic::x86_sse2_ucomigt_sd:
11096      Opc = X86ISD::UCOMI;
11097      CC = ISD::SETGT;
11098      break;
11099    case Intrinsic::x86_sse_ucomige_ss:
11100    case Intrinsic::x86_sse2_ucomige_sd:
11101      Opc = X86ISD::UCOMI;
11102      CC = ISD::SETGE;
11103      break;
11104    case Intrinsic::x86_sse_ucomineq_ss:
11105    case Intrinsic::x86_sse2_ucomineq_sd:
11106      Opc = X86ISD::UCOMI;
11107      CC = ISD::SETNE;
11108      break;
11109    }
11110
11111    SDValue LHS = Op.getOperand(1);
11112    SDValue RHS = Op.getOperand(2);
11113    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
11114    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
11115    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
11116    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
11117                                DAG.getConstant(X86CC, MVT::i8), Cond);
11118    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
11119  }
11120
11121  // Arithmetic intrinsics.
11122  case Intrinsic::x86_sse2_pmulu_dq:
11123  case Intrinsic::x86_avx2_pmulu_dq:
11124    return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
11125                       Op.getOperand(1), Op.getOperand(2));
11126
11127  // SSE2/AVX2 sub with unsigned saturation intrinsics
11128  case Intrinsic::x86_sse2_psubus_b:
11129  case Intrinsic::x86_sse2_psubus_w:
11130  case Intrinsic::x86_avx2_psubus_b:
11131  case Intrinsic::x86_avx2_psubus_w:
11132    return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
11133                       Op.getOperand(1), Op.getOperand(2));
11134
11135  // SSE3/AVX horizontal add/sub intrinsics
11136  case Intrinsic::x86_sse3_hadd_ps:
11137  case Intrinsic::x86_sse3_hadd_pd:
11138  case Intrinsic::x86_avx_hadd_ps_256:
11139  case Intrinsic::x86_avx_hadd_pd_256:
11140  case Intrinsic::x86_sse3_hsub_ps:
11141  case Intrinsic::x86_sse3_hsub_pd:
11142  case Intrinsic::x86_avx_hsub_ps_256:
11143  case Intrinsic::x86_avx_hsub_pd_256:
11144  case Intrinsic::x86_ssse3_phadd_w_128:
11145  case Intrinsic::x86_ssse3_phadd_d_128:
11146  case Intrinsic::x86_avx2_phadd_w:
11147  case Intrinsic::x86_avx2_phadd_d:
11148  case Intrinsic::x86_ssse3_phsub_w_128:
11149  case Intrinsic::x86_ssse3_phsub_d_128:
11150  case Intrinsic::x86_avx2_phsub_w:
11151  case Intrinsic::x86_avx2_phsub_d: {
11152    unsigned Opcode;
11153    switch (IntNo) {
11154    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
11155    case Intrinsic::x86_sse3_hadd_ps:
11156    case Intrinsic::x86_sse3_hadd_pd:
11157    case Intrinsic::x86_avx_hadd_ps_256:
11158    case Intrinsic::x86_avx_hadd_pd_256:
11159      Opcode = X86ISD::FHADD;
11160      break;
11161    case Intrinsic::x86_sse3_hsub_ps:
11162    case Intrinsic::x86_sse3_hsub_pd:
11163    case Intrinsic::x86_avx_hsub_ps_256:
11164    case Intrinsic::x86_avx_hsub_pd_256:
11165      Opcode = X86ISD::FHSUB;
11166      break;
11167    case Intrinsic::x86_ssse3_phadd_w_128:
11168    case Intrinsic::x86_ssse3_phadd_d_128:
11169    case Intrinsic::x86_avx2_phadd_w:
11170    case Intrinsic::x86_avx2_phadd_d:
11171      Opcode = X86ISD::HADD;
11172      break;
11173    case Intrinsic::x86_ssse3_phsub_w_128:
11174    case Intrinsic::x86_ssse3_phsub_d_128:
11175    case Intrinsic::x86_avx2_phsub_w:
11176    case Intrinsic::x86_avx2_phsub_d:
11177      Opcode = X86ISD::HSUB;
11178      break;
11179    }
11180    return DAG.getNode(Opcode, dl, Op.getValueType(),
11181                       Op.getOperand(1), Op.getOperand(2));
11182  }
11183
11184  // SSE2/SSE41/AVX2 integer max/min intrinsics.
11185  case Intrinsic::x86_sse2_pmaxu_b:
11186  case Intrinsic::x86_sse41_pmaxuw:
11187  case Intrinsic::x86_sse41_pmaxud:
11188  case Intrinsic::x86_avx2_pmaxu_b:
11189  case Intrinsic::x86_avx2_pmaxu_w:
11190  case Intrinsic::x86_avx2_pmaxu_d:
11191  case Intrinsic::x86_avx512_pmaxu_d:
11192  case Intrinsic::x86_avx512_pmaxu_q:
11193  case Intrinsic::x86_sse2_pminu_b:
11194  case Intrinsic::x86_sse41_pminuw:
11195  case Intrinsic::x86_sse41_pminud:
11196  case Intrinsic::x86_avx2_pminu_b:
11197  case Intrinsic::x86_avx2_pminu_w:
11198  case Intrinsic::x86_avx2_pminu_d:
11199  case Intrinsic::x86_avx512_pminu_d:
11200  case Intrinsic::x86_avx512_pminu_q:
11201  case Intrinsic::x86_sse41_pmaxsb:
11202  case Intrinsic::x86_sse2_pmaxs_w:
11203  case Intrinsic::x86_sse41_pmaxsd:
11204  case Intrinsic::x86_avx2_pmaxs_b:
11205  case Intrinsic::x86_avx2_pmaxs_w:
11206  case Intrinsic::x86_avx2_pmaxs_d:
11207  case Intrinsic::x86_avx512_pmaxs_d:
11208  case Intrinsic::x86_avx512_pmaxs_q:
11209  case Intrinsic::x86_sse41_pminsb:
11210  case Intrinsic::x86_sse2_pmins_w:
11211  case Intrinsic::x86_sse41_pminsd:
11212  case Intrinsic::x86_avx2_pmins_b:
11213  case Intrinsic::x86_avx2_pmins_w:
11214  case Intrinsic::x86_avx2_pmins_d:
11215  case Intrinsic::x86_avx512_pmins_d:
11216  case Intrinsic::x86_avx512_pmins_q: {
11217    unsigned Opcode;
11218    switch (IntNo) {
11219    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
11220    case Intrinsic::x86_sse2_pmaxu_b:
11221    case Intrinsic::x86_sse41_pmaxuw:
11222    case Intrinsic::x86_sse41_pmaxud:
11223    case Intrinsic::x86_avx2_pmaxu_b:
11224    case Intrinsic::x86_avx2_pmaxu_w:
11225    case Intrinsic::x86_avx2_pmaxu_d:
11226    case Intrinsic::x86_avx512_pmaxu_d:
11227    case Intrinsic::x86_avx512_pmaxu_q:
11228      Opcode = X86ISD::UMAX;
11229      break;
11230    case Intrinsic::x86_sse2_pminu_b:
11231    case Intrinsic::x86_sse41_pminuw:
11232    case Intrinsic::x86_sse41_pminud:
11233    case Intrinsic::x86_avx2_pminu_b:
11234    case Intrinsic::x86_avx2_pminu_w:
11235    case Intrinsic::x86_avx2_pminu_d:
11236    case Intrinsic::x86_avx512_pminu_d:
11237    case Intrinsic::x86_avx512_pminu_q:
11238      Opcode = X86ISD::UMIN;
11239      break;
11240    case Intrinsic::x86_sse41_pmaxsb:
11241    case Intrinsic::x86_sse2_pmaxs_w:
11242    case Intrinsic::x86_sse41_pmaxsd:
11243    case Intrinsic::x86_avx2_pmaxs_b:
11244    case Intrinsic::x86_avx2_pmaxs_w:
11245    case Intrinsic::x86_avx2_pmaxs_d:
11246    case Intrinsic::x86_avx512_pmaxs_d:
11247    case Intrinsic::x86_avx512_pmaxs_q:
11248      Opcode = X86ISD::SMAX;
11249      break;
11250    case Intrinsic::x86_sse41_pminsb:
11251    case Intrinsic::x86_sse2_pmins_w:
11252    case Intrinsic::x86_sse41_pminsd:
11253    case Intrinsic::x86_avx2_pmins_b:
11254    case Intrinsic::x86_avx2_pmins_w:
11255    case Intrinsic::x86_avx2_pmins_d:
11256    case Intrinsic::x86_avx512_pmins_d:
11257    case Intrinsic::x86_avx512_pmins_q:
11258      Opcode = X86ISD::SMIN;
11259      break;
11260    }
11261    return DAG.getNode(Opcode, dl, Op.getValueType(),
11262                       Op.getOperand(1), Op.getOperand(2));
11263  }
11264
11265  // SSE/SSE2/AVX floating point max/min intrinsics.
11266  case Intrinsic::x86_sse_max_ps:
11267  case Intrinsic::x86_sse2_max_pd:
11268  case Intrinsic::x86_avx_max_ps_256:
11269  case Intrinsic::x86_avx_max_pd_256:
11270  case Intrinsic::x86_avx512_max_ps_512:
11271  case Intrinsic::x86_avx512_max_pd_512:
11272  case Intrinsic::x86_sse_min_ps:
11273  case Intrinsic::x86_sse2_min_pd:
11274  case Intrinsic::x86_avx_min_ps_256:
11275  case Intrinsic::x86_avx_min_pd_256:
11276  case Intrinsic::x86_avx512_min_ps_512:
11277  case Intrinsic::x86_avx512_min_pd_512:  {
11278    unsigned Opcode;
11279    switch (IntNo) {
11280    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
11281    case Intrinsic::x86_sse_max_ps:
11282    case Intrinsic::x86_sse2_max_pd:
11283    case Intrinsic::x86_avx_max_ps_256:
11284    case Intrinsic::x86_avx_max_pd_256:
11285    case Intrinsic::x86_avx512_max_ps_512:
11286    case Intrinsic::x86_avx512_max_pd_512:
11287      Opcode = X86ISD::FMAX;
11288      break;
11289    case Intrinsic::x86_sse_min_ps:
11290    case Intrinsic::x86_sse2_min_pd:
11291    case Intrinsic::x86_avx_min_ps_256:
11292    case Intrinsic::x86_avx_min_pd_256:
11293    case Intrinsic::x86_avx512_min_ps_512:
11294    case Intrinsic::x86_avx512_min_pd_512:
11295      Opcode = X86ISD::FMIN;
11296      break;
11297    }
11298    return DAG.getNode(Opcode, dl, Op.getValueType(),
11299                       Op.getOperand(1), Op.getOperand(2));
11300  }
11301
11302  // AVX2 variable shift intrinsics
11303  case Intrinsic::x86_avx2_psllv_d:
11304  case Intrinsic::x86_avx2_psllv_q:
11305  case Intrinsic::x86_avx2_psllv_d_256:
11306  case Intrinsic::x86_avx2_psllv_q_256:
11307  case Intrinsic::x86_avx2_psrlv_d:
11308  case Intrinsic::x86_avx2_psrlv_q:
11309  case Intrinsic::x86_avx2_psrlv_d_256:
11310  case Intrinsic::x86_avx2_psrlv_q_256:
11311  case Intrinsic::x86_avx2_psrav_d:
11312  case Intrinsic::x86_avx2_psrav_d_256: {
11313    unsigned Opcode;
11314    switch (IntNo) {
11315    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
11316    case Intrinsic::x86_avx2_psllv_d:
11317    case Intrinsic::x86_avx2_psllv_q:
11318    case Intrinsic::x86_avx2_psllv_d_256:
11319    case Intrinsic::x86_avx2_psllv_q_256:
11320      Opcode = ISD::SHL;
11321      break;
11322    case Intrinsic::x86_avx2_psrlv_d:
11323    case Intrinsic::x86_avx2_psrlv_q:
11324    case Intrinsic::x86_avx2_psrlv_d_256:
11325    case Intrinsic::x86_avx2_psrlv_q_256:
11326      Opcode = ISD::SRL;
11327      break;
11328    case Intrinsic::x86_avx2_psrav_d:
11329    case Intrinsic::x86_avx2_psrav_d_256:
11330      Opcode = ISD::SRA;
11331      break;
11332    }
11333    return DAG.getNode(Opcode, dl, Op.getValueType(),
11334                       Op.getOperand(1), Op.getOperand(2));
11335  }
11336
11337  case Intrinsic::x86_ssse3_pshuf_b_128:
11338  case Intrinsic::x86_avx2_pshuf_b:
11339    return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
11340                       Op.getOperand(1), Op.getOperand(2));
11341
11342  case Intrinsic::x86_ssse3_psign_b_128:
11343  case Intrinsic::x86_ssse3_psign_w_128:
11344  case Intrinsic::x86_ssse3_psign_d_128:
11345  case Intrinsic::x86_avx2_psign_b:
11346  case Intrinsic::x86_avx2_psign_w:
11347  case Intrinsic::x86_avx2_psign_d:
11348    return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
11349                       Op.getOperand(1), Op.getOperand(2));
11350
11351  case Intrinsic::x86_sse41_insertps:
11352    return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
11353                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
11354
11355  case Intrinsic::x86_avx_vperm2f128_ps_256:
11356  case Intrinsic::x86_avx_vperm2f128_pd_256:
11357  case Intrinsic::x86_avx_vperm2f128_si_256:
11358  case Intrinsic::x86_avx2_vperm2i128:
11359    return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
11360                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
11361
11362  case Intrinsic::x86_avx2_permd:
11363  case Intrinsic::x86_avx2_permps:
11364    // Operands intentionally swapped. Mask is last operand to intrinsic,
11365    // but second operand for node/instruction.
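    // For example, vpermd(src, mask) is emitted as (X86ISD::VPERMV mask, src).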
11366    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
11367                       Op.getOperand(2), Op.getOperand(1));
11368
11369  case Intrinsic::x86_sse_sqrt_ps:
11370  case Intrinsic::x86_sse2_sqrt_pd:
11371  case Intrinsic::x86_avx_sqrt_ps_256:
11372  case Intrinsic::x86_avx_sqrt_pd_256:
11373    return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1));
11374
11375  // ptest and testp intrinsics. The intrinsics these come from are designed to
11376  // return an integer value, not just an instruction, so lower them to the ptest
11377  // or testp pattern and a setcc for the result.
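  // For example, ptestz(a, b) becomes
  //   (zext i32 (X86ISD::SETCC COND_E, (X86ISD::PTEST a, b))).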
11378  case Intrinsic::x86_sse41_ptestz:
11379  case Intrinsic::x86_sse41_ptestc:
11380  case Intrinsic::x86_sse41_ptestnzc:
11381  case Intrinsic::x86_avx_ptestz_256:
11382  case Intrinsic::x86_avx_ptestc_256:
11383  case Intrinsic::x86_avx_ptestnzc_256:
11384  case Intrinsic::x86_avx_vtestz_ps:
11385  case Intrinsic::x86_avx_vtestc_ps:
11386  case Intrinsic::x86_avx_vtestnzc_ps:
11387  case Intrinsic::x86_avx_vtestz_pd:
11388  case Intrinsic::x86_avx_vtestc_pd:
11389  case Intrinsic::x86_avx_vtestnzc_pd:
11390  case Intrinsic::x86_avx_vtestz_ps_256:
11391  case Intrinsic::x86_avx_vtestc_ps_256:
11392  case Intrinsic::x86_avx_vtestnzc_ps_256:
11393  case Intrinsic::x86_avx_vtestz_pd_256:
11394  case Intrinsic::x86_avx_vtestc_pd_256:
11395  case Intrinsic::x86_avx_vtestnzc_pd_256: {
11396    bool IsTestPacked = false;
11397    unsigned X86CC;
11398    switch (IntNo) {
11399    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
11400    case Intrinsic::x86_avx_vtestz_ps:
11401    case Intrinsic::x86_avx_vtestz_pd:
11402    case Intrinsic::x86_avx_vtestz_ps_256:
11403    case Intrinsic::x86_avx_vtestz_pd_256:
11404      IsTestPacked = true; // Fallthrough
11405    case Intrinsic::x86_sse41_ptestz:
11406    case Intrinsic::x86_avx_ptestz_256:
11407      // ZF = 1
11408      X86CC = X86::COND_E;
11409      break;
11410    case Intrinsic::x86_avx_vtestc_ps:
11411    case Intrinsic::x86_avx_vtestc_pd:
11412    case Intrinsic::x86_avx_vtestc_ps_256:
11413    case Intrinsic::x86_avx_vtestc_pd_256:
11414      IsTestPacked = true; // Fallthrough
11415    case Intrinsic::x86_sse41_ptestc:
11416    case Intrinsic::x86_avx_ptestc_256:
11417      // CF = 1
11418      X86CC = X86::COND_B;
11419      break;
11420    case Intrinsic::x86_avx_vtestnzc_ps:
11421    case Intrinsic::x86_avx_vtestnzc_pd:
11422    case Intrinsic::x86_avx_vtestnzc_ps_256:
11423    case Intrinsic::x86_avx_vtestnzc_pd_256:
11424      IsTestPacked = true; // Fallthrough
11425    case Intrinsic::x86_sse41_ptestnzc:
11426    case Intrinsic::x86_avx_ptestnzc_256:
11427      // ZF and CF = 0
11428      X86CC = X86::COND_A;
11429      break;
11430    }
11431
11432    SDValue LHS = Op.getOperand(1);
11433    SDValue RHS = Op.getOperand(2);
11434    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
11435    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
11436    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
11437    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
11438    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
11439  }
11440  case Intrinsic::x86_avx512_kortestz:
11441  case Intrinsic::x86_avx512_kortestc: {
11442    unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz) ? X86::COND_E : X86::COND_B;
11443    SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
11444    SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
11445    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
11446    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
11447    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
11448    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
11449  }
11450
11451  // SSE/AVX shift intrinsics
11452  case Intrinsic::x86_sse2_psll_w:
11453  case Intrinsic::x86_sse2_psll_d:
11454  case Intrinsic::x86_sse2_psll_q:
11455  case Intrinsic::x86_avx2_psll_w:
11456  case Intrinsic::x86_avx2_psll_d:
11457  case Intrinsic::x86_avx2_psll_q:
11458  case Intrinsic::x86_sse2_psrl_w:
11459  case Intrinsic::x86_sse2_psrl_d:
11460  case Intrinsic::x86_sse2_psrl_q:
11461  case Intrinsic::x86_avx2_psrl_w:
11462  case Intrinsic::x86_avx2_psrl_d:
11463  case Intrinsic::x86_avx2_psrl_q:
11464  case Intrinsic::x86_sse2_psra_w:
11465  case Intrinsic::x86_sse2_psra_d:
11466  case Intrinsic::x86_avx2_psra_w:
11467  case Intrinsic::x86_avx2_psra_d: {
11468    unsigned Opcode;
11469    switch (IntNo) {
11470    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
11471    case Intrinsic::x86_sse2_psll_w:
11472    case Intrinsic::x86_sse2_psll_d:
11473    case Intrinsic::x86_sse2_psll_q:
11474    case Intrinsic::x86_avx2_psll_w:
11475    case Intrinsic::x86_avx2_psll_d:
11476    case Intrinsic::x86_avx2_psll_q:
11477      Opcode = X86ISD::VSHL;
11478      break;
11479    case Intrinsic::x86_sse2_psrl_w:
11480    case Intrinsic::x86_sse2_psrl_d:
11481    case Intrinsic::x86_sse2_psrl_q:
11482    case Intrinsic::x86_avx2_psrl_w:
11483    case Intrinsic::x86_avx2_psrl_d:
11484    case Intrinsic::x86_avx2_psrl_q:
11485      Opcode = X86ISD::VSRL;
11486      break;
11487    case Intrinsic::x86_sse2_psra_w:
11488    case Intrinsic::x86_sse2_psra_d:
11489    case Intrinsic::x86_avx2_psra_w:
11490    case Intrinsic::x86_avx2_psra_d:
11491      Opcode = X86ISD::VSRA;
11492      break;
11493    }
11494    return DAG.getNode(Opcode, dl, Op.getValueType(),
11495                       Op.getOperand(1), Op.getOperand(2));
11496  }
11497
11498  // SSE/AVX immediate shift intrinsics
11499  case Intrinsic::x86_sse2_pslli_w:
11500  case Intrinsic::x86_sse2_pslli_d:
11501  case Intrinsic::x86_sse2_pslli_q:
11502  case Intrinsic::x86_avx2_pslli_w:
11503  case Intrinsic::x86_avx2_pslli_d:
11504  case Intrinsic::x86_avx2_pslli_q:
11505  case Intrinsic::x86_sse2_psrli_w:
11506  case Intrinsic::x86_sse2_psrli_d:
11507  case Intrinsic::x86_sse2_psrli_q:
11508  case Intrinsic::x86_avx2_psrli_w:
11509  case Intrinsic::x86_avx2_psrli_d:
11510  case Intrinsic::x86_avx2_psrli_q:
11511  case Intrinsic::x86_sse2_psrai_w:
11512  case Intrinsic::x86_sse2_psrai_d:
11513  case Intrinsic::x86_avx2_psrai_w:
11514  case Intrinsic::x86_avx2_psrai_d: {
11515    unsigned Opcode;
11516    switch (IntNo) {
11517    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
11518    case Intrinsic::x86_sse2_pslli_w:
11519    case Intrinsic::x86_sse2_pslli_d:
11520    case Intrinsic::x86_sse2_pslli_q:
11521    case Intrinsic::x86_avx2_pslli_w:
11522    case Intrinsic::x86_avx2_pslli_d:
11523    case Intrinsic::x86_avx2_pslli_q:
11524      Opcode = X86ISD::VSHLI;
11525      break;
11526    case Intrinsic::x86_sse2_psrli_w:
11527    case Intrinsic::x86_sse2_psrli_d:
11528    case Intrinsic::x86_sse2_psrli_q:
11529    case Intrinsic::x86_avx2_psrli_w:
11530    case Intrinsic::x86_avx2_psrli_d:
11531    case Intrinsic::x86_avx2_psrli_q:
11532      Opcode = X86ISD::VSRLI;
11533      break;
11534    case Intrinsic::x86_sse2_psrai_w:
11535    case Intrinsic::x86_sse2_psrai_d:
11536    case Intrinsic::x86_avx2_psrai_w:
11537    case Intrinsic::x86_avx2_psrai_d:
11538      Opcode = X86ISD::VSRAI;
11539      break;
11540    }
11541    return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
11542                               Op.getOperand(1), Op.getOperand(2), DAG);
11543  }
11544
11545  case Intrinsic::x86_sse42_pcmpistria128:
11546  case Intrinsic::x86_sse42_pcmpestria128:
11547  case Intrinsic::x86_sse42_pcmpistric128:
11548  case Intrinsic::x86_sse42_pcmpestric128:
11549  case Intrinsic::x86_sse42_pcmpistrio128:
11550  case Intrinsic::x86_sse42_pcmpestrio128:
11551  case Intrinsic::x86_sse42_pcmpistris128:
11552  case Intrinsic::x86_sse42_pcmpestris128:
11553  case Intrinsic::x86_sse42_pcmpistriz128:
11554  case Intrinsic::x86_sse42_pcmpestriz128: {
11555    unsigned Opcode;
11556    unsigned X86CC;
11557    switch (IntNo) {
11558    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
11559    case Intrinsic::x86_sse42_pcmpistria128:
11560      Opcode = X86ISD::PCMPISTRI;
11561      X86CC = X86::COND_A;
11562      break;
11563    case Intrinsic::x86_sse42_pcmpestria128:
11564      Opcode = X86ISD::PCMPESTRI;
11565      X86CC = X86::COND_A;
11566      break;
11567    case Intrinsic::x86_sse42_pcmpistric128:
11568      Opcode = X86ISD::PCMPISTRI;
11569      X86CC = X86::COND_B;
11570      break;
11571    case Intrinsic::x86_sse42_pcmpestric128:
11572      Opcode = X86ISD::PCMPESTRI;
11573      X86CC = X86::COND_B;
11574      break;
11575    case Intrinsic::x86_sse42_pcmpistrio128:
11576      Opcode = X86ISD::PCMPISTRI;
11577      X86CC = X86::COND_O;
11578      break;
11579    case Intrinsic::x86_sse42_pcmpestrio128:
11580      Opcode = X86ISD::PCMPESTRI;
11581      X86CC = X86::COND_O;
11582      break;
11583    case Intrinsic::x86_sse42_pcmpistris128:
11584      Opcode = X86ISD::PCMPISTRI;
11585      X86CC = X86::COND_S;
11586      break;
11587    case Intrinsic::x86_sse42_pcmpestris128:
11588      Opcode = X86ISD::PCMPESTRI;
11589      X86CC = X86::COND_S;
11590      break;
11591    case Intrinsic::x86_sse42_pcmpistriz128:
11592      Opcode = X86ISD::PCMPISTRI;
11593      X86CC = X86::COND_E;
11594      break;
11595    case Intrinsic::x86_sse42_pcmpestriz128:
11596      Opcode = X86ISD::PCMPESTRI;
11597      X86CC = X86::COND_E;
11598      break;
11599    }
11600    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
11601    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
11602    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
11603    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
11604                                DAG.getConstant(X86CC, MVT::i8),
11605                                SDValue(PCMP.getNode(), 1));
11606    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
11607  }
11608
11609  case Intrinsic::x86_sse42_pcmpistri128:
11610  case Intrinsic::x86_sse42_pcmpestri128: {
11611    unsigned Opcode;
11612    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
11613      Opcode = X86ISD::PCMPISTRI;
11614    else
11615      Opcode = X86ISD::PCMPESTRI;
11616
11617    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
11618    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
11619    return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
11620  }
11621  case Intrinsic::x86_fma_vfmadd_ps:
11622  case Intrinsic::x86_fma_vfmadd_pd:
11623  case Intrinsic::x86_fma_vfmsub_ps:
11624  case Intrinsic::x86_fma_vfmsub_pd:
11625  case Intrinsic::x86_fma_vfnmadd_ps:
11626  case Intrinsic::x86_fma_vfnmadd_pd:
11627  case Intrinsic::x86_fma_vfnmsub_ps:
11628  case Intrinsic::x86_fma_vfnmsub_pd:
11629  case Intrinsic::x86_fma_vfmaddsub_ps:
11630  case Intrinsic::x86_fma_vfmaddsub_pd:
11631  case Intrinsic::x86_fma_vfmsubadd_ps:
11632  case Intrinsic::x86_fma_vfmsubadd_pd:
11633  case Intrinsic::x86_fma_vfmadd_ps_256:
11634  case Intrinsic::x86_fma_vfmadd_pd_256:
11635  case Intrinsic::x86_fma_vfmsub_ps_256:
11636  case Intrinsic::x86_fma_vfmsub_pd_256:
11637  case Intrinsic::x86_fma_vfnmadd_ps_256:
11638  case Intrinsic::x86_fma_vfnmadd_pd_256:
11639  case Intrinsic::x86_fma_vfnmsub_ps_256:
11640  case Intrinsic::x86_fma_vfnmsub_pd_256:
11641  case Intrinsic::x86_fma_vfmaddsub_ps_256:
11642  case Intrinsic::x86_fma_vfmaddsub_pd_256:
11643  case Intrinsic::x86_fma_vfmsubadd_ps_256:
11644  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
11645    unsigned Opc;
11646    switch (IntNo) {
11647    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
11648    case Intrinsic::x86_fma_vfmadd_ps:
11649    case Intrinsic::x86_fma_vfmadd_pd:
11650    case Intrinsic::x86_fma_vfmadd_ps_256:
11651    case Intrinsic::x86_fma_vfmadd_pd_256:
11652      Opc = X86ISD::FMADD;
11653      break;
11654    case Intrinsic::x86_fma_vfmsub_ps:
11655    case Intrinsic::x86_fma_vfmsub_pd:
11656    case Intrinsic::x86_fma_vfmsub_ps_256:
11657    case Intrinsic::x86_fma_vfmsub_pd_256:
11658      Opc = X86ISD::FMSUB;
11659      break;
11660    case Intrinsic::x86_fma_vfnmadd_ps:
11661    case Intrinsic::x86_fma_vfnmadd_pd:
11662    case Intrinsic::x86_fma_vfnmadd_ps_256:
11663    case Intrinsic::x86_fma_vfnmadd_pd_256:
11664      Opc = X86ISD::FNMADD;
11665      break;
11666    case Intrinsic::x86_fma_vfnmsub_ps:
11667    case Intrinsic::x86_fma_vfnmsub_pd:
11668    case Intrinsic::x86_fma_vfnmsub_ps_256:
11669    case Intrinsic::x86_fma_vfnmsub_pd_256:
11670      Opc = X86ISD::FNMSUB;
11671      break;
11672    case Intrinsic::x86_fma_vfmaddsub_ps:
11673    case Intrinsic::x86_fma_vfmaddsub_pd:
11674    case Intrinsic::x86_fma_vfmaddsub_ps_256:
11675    case Intrinsic::x86_fma_vfmaddsub_pd_256:
11676      Opc = X86ISD::FMADDSUB;
11677      break;
11678    case Intrinsic::x86_fma_vfmsubadd_ps:
11679    case Intrinsic::x86_fma_vfmsubadd_pd:
11680    case Intrinsic::x86_fma_vfmsubadd_ps_256:
11681    case Intrinsic::x86_fma_vfmsubadd_pd_256:
11682      Opc = X86ISD::FMSUBADD;
11683      break;
11684    }
11685
11686    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
11687                       Op.getOperand(2), Op.getOperand(3));
11688  }
11689  }
11690}
11691
11692static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
11693                             SDValue Base, SDValue Index,
11694                             SDValue ScaleOp, SDValue Chain,
11695                             const X86Subtarget * Subtarget) {
11696  SDLoc dl(Op);
11697  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
11698  assert(C && "Invalid scale type");
11699  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
11700  SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
11701  EVT MaskVT = MVT::getVectorVT(MVT::i1,
11702                                Index.getValueType().getVectorNumElements());
11703  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
11704  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
11705  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
11706  SDValue Segment = DAG.getRegister(0, MVT::i32);
11707  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
11708  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
11709  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
11710  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
11711}
11712
11713static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
11714                              SDValue Src, SDValue Mask, SDValue Base,
11715                              SDValue Index, SDValue ScaleOp, SDValue Chain,
11716                              const X86Subtarget * Subtarget) {
11717  SDLoc dl(Op);
11718  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
11719  assert(C && "Invalid scale type");
11720  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
11721  EVT MaskVT = MVT::getVectorVT(MVT::i1,
11722                                Index.getValueType().getVectorNumElements());
11723  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
11724  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
11725  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
11726  SDValue Segment = DAG.getRegister(0, MVT::i32);
11727  if (Src.getOpcode() == ISD::UNDEF)
11728    Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
11729  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
11730  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
11731  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
11732  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
11733}
11734
11735static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
11736                              SDValue Src, SDValue Base, SDValue Index,
11737                              SDValue ScaleOp, SDValue Chain) {
11738  SDLoc dl(Op);
11739  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
11740  assert(C && "Invalid scale type");
11741  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
11742  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
11743  SDValue Segment = DAG.getRegister(0, MVT::i32);
11744  EVT MaskVT = MVT::getVectorVT(MVT::i1,
11745                                Index.getValueType().getVectorNumElements());
11746  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
11747  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
11748  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
11749  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
11750  return SDValue(Res, 1);
11751}
11752
11753static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
11754                               SDValue Src, SDValue Mask, SDValue Base,
11755                               SDValue Index, SDValue ScaleOp, SDValue Chain) {
11756  SDLoc dl(Op);
11757  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
11758  assert(C && "Invalid scale type");
11759  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
11760  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
11761  SDValue Segment = DAG.getRegister(0, MVT::i32);
11762  EVT MaskVT = MVT::getVectorVT(MVT::i1,
11763                                Index.getValueType().getVectorNumElements());
11764  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
11765  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
11766  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
11767  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
11768  return SDValue(Res, 1);
11769}
11770
11771static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
11772                                      SelectionDAG &DAG) {
11773  SDLoc dl(Op);
11774  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
11775  switch (IntNo) {
11776  default: return SDValue();    // Don't custom lower most intrinsics.
11777
11778  // RDRAND/RDSEED intrinsics.
11779  case Intrinsic::x86_rdrand_16:
11780  case Intrinsic::x86_rdrand_32:
11781  case Intrinsic::x86_rdrand_64:
11782  case Intrinsic::x86_rdseed_16:
11783  case Intrinsic::x86_rdseed_32:
11784  case Intrinsic::x86_rdseed_64: {
11785    unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
11786                       IntNo == Intrinsic::x86_rdseed_32 ||
11787                       IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
11788                                                            X86ISD::RDRAND;
11789    // Emit the node with the right value type.
11790    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
11791    SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
11792
11793    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
11794    // Otherwise return the value from RDRAND/RDSEED, which is always 0, cast to i32.
11795    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
11796                      DAG.getConstant(1, Op->getValueType(1)),
11797                      DAG.getConstant(X86::COND_B, MVT::i32),
11798                      SDValue(Result.getNode(), 1) };
11799    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
11800                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
11801                                  Ops, array_lengthof(Ops));
11802
11803    // Return { result, isValid, chain }.
11804    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
11805                       SDValue(Result.getNode(), 2));
11806  }
11807  //int_gather(index, base, scale);
11808  case Intrinsic::x86_avx512_gather_qpd_512:
11809  case Intrinsic::x86_avx512_gather_qps_512:
11810  case Intrinsic::x86_avx512_gather_dpd_512:
11811  case Intrinsic::x86_avx512_gather_qpi_512:
11812  case Intrinsic::x86_avx512_gather_qpq_512:
11813  case Intrinsic::x86_avx512_gather_dpq_512:
11814  case Intrinsic::x86_avx512_gather_dps_512:
11815  case Intrinsic::x86_avx512_gather_dpi_512: {
11816    unsigned Opc;
11817    switch (IntNo) {
11818      default: llvm_unreachable("Unexpected intrinsic!");
11819      case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
11820      case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
11821      case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
11822      case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
11823      case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
11824      case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
11825      case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
11826      case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
11827    }
11828    SDValue Chain = Op.getOperand(0);
11829    SDValue Index = Op.getOperand(2);
11830    SDValue Base  = Op.getOperand(3);
11831    SDValue Scale = Op.getOperand(4);
11832    return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
11833  }
11834  // int_gather_mask(v1, mask, index, base, scale);
11835  case Intrinsic::x86_avx512_gather_qps_mask_512:
11836  case Intrinsic::x86_avx512_gather_qpd_mask_512:
11837  case Intrinsic::x86_avx512_gather_dpd_mask_512:
11838  case Intrinsic::x86_avx512_gather_dps_mask_512:
11839  case Intrinsic::x86_avx512_gather_qpi_mask_512:
11840  case Intrinsic::x86_avx512_gather_qpq_mask_512:
11841  case Intrinsic::x86_avx512_gather_dpi_mask_512:
11842  case Intrinsic::x86_avx512_gather_dpq_mask_512: {
11843    unsigned Opc;
11844    switch (IntNo) {
11845      default: llvm_unreachable("Unexpected intrinsic!");
11846      case Intrinsic::x86_avx512_gather_qps_mask_512:
11847        Opc = X86::VGATHERQPSZrm; break;
11848      case Intrinsic::x86_avx512_gather_qpd_mask_512:
11849        Opc = X86::VGATHERQPDZrm; break;
11850      case Intrinsic::x86_avx512_gather_dpd_mask_512:
11851        Opc = X86::VGATHERDPDZrm; break;
11852      case Intrinsic::x86_avx512_gather_dps_mask_512:
11853        Opc = X86::VGATHERDPSZrm; break;
11854      case Intrinsic::x86_avx512_gather_qpi_mask_512:
11855        Opc = X86::VPGATHERQDZrm; break;
11856      case Intrinsic::x86_avx512_gather_qpq_mask_512:
11857        Opc = X86::VPGATHERQQZrm; break;
11858      case Intrinsic::x86_avx512_gather_dpi_mask_512:
11859        Opc = X86::VPGATHERDDZrm; break;
11860      case Intrinsic::x86_avx512_gather_dpq_mask_512:
11861        Opc = X86::VPGATHERDQZrm; break;
11862    }
11863    SDValue Chain = Op.getOperand(0);
11864    SDValue Src   = Op.getOperand(2);
11865    SDValue Mask  = Op.getOperand(3);
11866    SDValue Index = Op.getOperand(4);
11867    SDValue Base  = Op.getOperand(5);
11868    SDValue Scale = Op.getOperand(6);
11869    return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
11870                          Subtarget);
11871  }
11872  // int_scatter(base, index, v1, scale);
11873  case Intrinsic::x86_avx512_scatter_qpd_512:
11874  case Intrinsic::x86_avx512_scatter_qps_512:
11875  case Intrinsic::x86_avx512_scatter_dpd_512:
11876  case Intrinsic::x86_avx512_scatter_qpi_512:
11877  case Intrinsic::x86_avx512_scatter_qpq_512:
11878  case Intrinsic::x86_avx512_scatter_dpq_512:
11879  case Intrinsic::x86_avx512_scatter_dps_512:
11880  case Intrinsic::x86_avx512_scatter_dpi_512: {
11881    unsigned Opc;
11882    switch (IntNo) {
11883      default: llvm_unreachable("Unexpected intrinsic!");
11884      case Intrinsic::x86_avx512_scatter_qpd_512:
11885        Opc = X86::VSCATTERQPDZmr; break;
11886      case Intrinsic::x86_avx512_scatter_qps_512:
11887        Opc = X86::VSCATTERQPSZmr; break;
11888      case Intrinsic::x86_avx512_scatter_dpd_512:
11889        Opc = X86::VSCATTERDPDZmr; break;
11890      case Intrinsic::x86_avx512_scatter_dps_512:
11891        Opc = X86::VSCATTERDPSZmr; break;
11892      case Intrinsic::x86_avx512_scatter_qpi_512:
11893        Opc = X86::VPSCATTERQDZmr; break;
11894      case Intrinsic::x86_avx512_scatter_qpq_512:
11895        Opc = X86::VPSCATTERQQZmr; break;
11896      case Intrinsic::x86_avx512_scatter_dpq_512:
11897        Opc = X86::VPSCATTERDQZmr; break;
11898      case Intrinsic::x86_avx512_scatter_dpi_512:
11899        Opc = X86::VPSCATTERDDZmr; break;
11900    }
11901    SDValue Chain = Op.getOperand(0);
11902    SDValue Base  = Op.getOperand(2);
11903    SDValue Index = Op.getOperand(3);
11904    SDValue Src   = Op.getOperand(4);
11905    SDValue Scale = Op.getOperand(5);
11906    return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
11907  }
11908  // int_scatter_mask(base, mask, index, v1, scale);
11909  case Intrinsic::x86_avx512_scatter_qps_mask_512:
11910  case Intrinsic::x86_avx512_scatter_qpd_mask_512:
11911  case Intrinsic::x86_avx512_scatter_dpd_mask_512:
11912  case Intrinsic::x86_avx512_scatter_dps_mask_512:
11913  case Intrinsic::x86_avx512_scatter_qpi_mask_512:
11914  case Intrinsic::x86_avx512_scatter_qpq_mask_512:
11915  case Intrinsic::x86_avx512_scatter_dpi_mask_512:
11916  case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
11917    unsigned Opc;
11918    switch (IntNo) {
11919      default: llvm_unreachable("Unexpected intrinsic!");
11920      case Intrinsic::x86_avx512_scatter_qpd_mask_512:
11921        Opc = X86::VSCATTERQPDZmr; break;
11922      case Intrinsic::x86_avx512_scatter_qps_mask_512:
11923        Opc = X86::VSCATTERQPSZmr; break;
11924      case Intrinsic::x86_avx512_scatter_dpd_mask_512:
11925        Opc = X86::VSCATTERDPDZmr; break;
11926      case Intrinsic::x86_avx512_scatter_dps_mask_512:
11927        Opc = X86::VSCATTERDPSZmr; break;
11928      case Intrinsic::x86_avx512_scatter_qpi_mask_512:
11929        Opc = X86::VPSCATTERQDZmr; break;
11930      case Intrinsic::x86_avx512_scatter_qpq_mask_512:
11931        Opc = X86::VPSCATTERQQZmr; break;
11932      case Intrinsic::x86_avx512_scatter_dpq_mask_512:
11933        Opc = X86::VPSCATTERDQZmr; break;
11934      case Intrinsic::x86_avx512_scatter_dpi_mask_512:
11935        Opc = X86::VPSCATTERDDZmr; break;
11936    }
11937    SDValue Chain = Op.getOperand(0);
11938    SDValue Base  = Op.getOperand(2);
11939    SDValue Mask  = Op.getOperand(3);
11940    SDValue Index = Op.getOperand(4);
11941    SDValue Src   = Op.getOperand(5);
11942    SDValue Scale = Op.getOperand(6);
11943    return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
11944  }
11945  // XTEST intrinsics.
11946  case Intrinsic::x86_xtest: {
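    // XTEST sets ZF to 0 when an RTM/HLE transaction is active and to 1
    // otherwise, so a SETNE on the produced flags yields 1 inside a
    // transaction and 0 outside of it.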
11947    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
11948    SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
11949    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
11950                                DAG.getConstant(X86::COND_NE, MVT::i8),
11951                                InTrans);
11952    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
11953    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
11954                       Ret, SDValue(InTrans.getNode(), 1));
11955  }
11956  }
11957}
11958
11959SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
11960                                           SelectionDAG &DAG) const {
11961  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
11962  MFI->setReturnAddressIsTaken(true);
11963
11964  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
11965  SDLoc dl(Op);
11966  EVT PtrVT = getPointerTy();
11967
11968  if (Depth > 0) {
11969    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
11970    const X86RegisterInfo *RegInfo =
11971      static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
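    // With a frame-pointer based frame, the return address lives in the slot
    // immediately above the saved frame pointer, i.e. at FrameAddr + SlotSize.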
11972    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
11973    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
11974                       DAG.getNode(ISD::ADD, dl, PtrVT,
11975                                   FrameAddr, Offset),
11976                       MachinePointerInfo(), false, false, false, 0);
11977  }
11978
11979  // Just load the return address.
11980  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
11981  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
11982                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
11983}
11984
11985SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
11986  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
11987  MFI->setFrameAddressIsTaken(true);
11988
11989  EVT VT = Op.getValueType();
11990  SDLoc dl(Op);  // FIXME probably not meaningful
11991  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
11992  const X86RegisterInfo *RegInfo =
11993    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
11994  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
11995  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
11996          (FrameReg == X86::EBP && VT == MVT::i32)) &&
11997         "Invalid Frame Register!");
11998  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
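  // Each saved frame pointer points at its caller's saved frame pointer, so
  // loading through it Depth times walks up to the requested ancestor frame.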
11999  while (Depth--)
12000    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
12001                            MachinePointerInfo(),
12002                            false, false, false, 0);
12003  return FrameAddr;
12004}
12005
12006SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
12007                                                     SelectionDAG &DAG) const {
12008  const X86RegisterInfo *RegInfo =
12009    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
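  // The first incoming argument sits two slots above the frame pointer: one
  // slot for the saved frame pointer and one for the return address.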
12010  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
12011}
12012
12013SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
12014  SDValue Chain     = Op.getOperand(0);
12015  SDValue Offset    = Op.getOperand(1);
12016  SDValue Handler   = Op.getOperand(2);
12017  SDLoc dl      (Op);
12018
12019  EVT PtrVT = getPointerTy();
12020  const X86RegisterInfo *RegInfo =
12021    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
12022  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
12023  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
12024          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
12025         "Invalid Frame Register!");
12026  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
12027  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
12028
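  // Write the handler address into the return-address slot of the frame being
  // unwound (Frame + SlotSize), displaced by the requested Offset, and pass
  // that slot address to the EH_RETURN pseudo in ECX/RCX so it can restore the
  // stack pointer before transferring control to the handler.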
12029  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
12030                                 DAG.getIntPtrConstant(RegInfo->getSlotSize()));
12031  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
12032  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
12033                       false, false, 0);
12034  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
12035
12036  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
12037                     DAG.getRegister(StoreAddrReg, PtrVT));
12038}
12039
12040SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
12041                                               SelectionDAG &DAG) const {
12042  SDLoc DL(Op);
12043  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
12044                     DAG.getVTList(MVT::i32, MVT::Other),
12045                     Op.getOperand(0), Op.getOperand(1));
12046}
12047
12048SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
12049                                                SelectionDAG &DAG) const {
12050  SDLoc DL(Op);
12051  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
12052                     Op.getOperand(0), Op.getOperand(1));
12053}
12054
12055static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
12056  return Op.getOperand(0);
12057}
12058
12059SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
12060                                                SelectionDAG &DAG) const {
12061  SDValue Root = Op.getOperand(0);
12062  SDValue Trmp = Op.getOperand(1); // trampoline
12063  SDValue FPtr = Op.getOperand(2); // nested function
12064  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
12065  SDLoc dl (Op);
12066
12067  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12068  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
12069
12070  if (Subtarget->is64Bit()) {
12071    SDValue OutChains[6];
12072
12073    // Large code-model.
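    // The bytes stored below form the trampoline:
    //   movabsq $<nested function>, %r11     # bytes  0..9
    //   movabsq $<nest value>,      %r10     # bytes 10..19
    //   jmpq    *%r11                        # bytes 20..22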
12074    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
12075    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
12076
12077    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
12078    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
12079
12080    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
12081
12082    // Load the pointer to the nested function into R11.
12083    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
12084    SDValue Addr = Trmp;
12085    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
12086                                Addr, MachinePointerInfo(TrmpAddr),
12087                                false, false, 0);
12088
12089    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
12090                       DAG.getConstant(2, MVT::i64));
12091    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
12092                                MachinePointerInfo(TrmpAddr, 2),
12093                                false, false, 2);
12094
12095    // Load the 'nest' parameter value into R10.
12096    // R10 is specified in X86CallingConv.td
12097    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
12098    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
12099                       DAG.getConstant(10, MVT::i64));
12100    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
12101                                Addr, MachinePointerInfo(TrmpAddr, 10),
12102                                false, false, 0);
12103
12104    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
12105                       DAG.getConstant(12, MVT::i64));
12106    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
12107                                MachinePointerInfo(TrmpAddr, 12),
12108                                false, false, 2);
12109
12110    // Jump to the nested function.
12111    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
12112    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
12113                       DAG.getConstant(20, MVT::i64));
12114    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
12115                                Addr, MachinePointerInfo(TrmpAddr, 20),
12116                                false, false, 0);
12117
12118    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
12119    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
12120                       DAG.getConstant(22, MVT::i64));
12121    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
12122                                MachinePointerInfo(TrmpAddr, 22),
12123                                false, false, 0);
12124
12125    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
12126  } else {
12127    const Function *Func =
12128      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
12129    CallingConv::ID CC = Func->getCallingConv();
12130    unsigned NestReg;
12131
12132    switch (CC) {
12133    default:
12134      llvm_unreachable("Unsupported calling convention");
12135    case CallingConv::C:
12136    case CallingConv::X86_StdCall: {
12137      // Pass 'nest' parameter in ECX.
12138      // Must be kept in sync with X86CallingConv.td
12139      NestReg = X86::ECX;
12140
12141      // Check that ECX wasn't needed by an 'inreg' parameter.
12142      FunctionType *FTy = Func->getFunctionType();
12143      const AttributeSet &Attrs = Func->getAttributes();
12144
12145      if (!Attrs.isEmpty() && !Func->isVarArg()) {
12146        unsigned InRegCount = 0;
12147        unsigned Idx = 1;
12148
12149        for (FunctionType::param_iterator I = FTy->param_begin(),
12150             E = FTy->param_end(); I != E; ++I, ++Idx)
12151          if (Attrs.hasAttribute(Idx, Attribute::InReg))
12152            // FIXME: should only count parameters that are lowered to integers.
12153            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
12154
12155        if (InRegCount > 2) {
12156          report_fatal_error("Nest register in use - reduce number of inreg"
12157                             " parameters!");
12158        }
12159      }
12160      break;
12161    }
12162    case CallingConv::X86_FastCall:
12163    case CallingConv::X86_ThisCall:
12164    case CallingConv::Fast:
12165      // Pass 'nest' parameter in EAX.
12166      // Must be kept in sync with X86CallingConv.td
12167      NestReg = X86::EAX;
12168      break;
12169    }
12170
12171    SDValue OutChains[4];
12172    SDValue Addr, Disp;
12173
12174    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
12175                       DAG.getConstant(10, MVT::i32));
12176    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
12177
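    // The ten bytes stored below form the trampoline:
    //   movl $<nest value>, %ecx|%eax        # bytes 0..4
    //   jmp  <nested function>               # bytes 5..9 (rel32 jump)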
12178    // This is storing the opcode for MOV32ri.
12179    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
12180    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
12181    OutChains[0] = DAG.getStore(Root, dl,
12182                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
12183                                Trmp, MachinePointerInfo(TrmpAddr),
12184                                false, false, 0);
12185
12186    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
12187                       DAG.getConstant(1, MVT::i32));
12188    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
12189                                MachinePointerInfo(TrmpAddr, 1),
12190                                false, false, 1);
12191
12192    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
12193    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
12194                       DAG.getConstant(5, MVT::i32));
12195    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
12196                                MachinePointerInfo(TrmpAddr, 5),
12197                                false, false, 1);
12198
12199    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
12200                       DAG.getConstant(6, MVT::i32));
12201    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
12202                                MachinePointerInfo(TrmpAddr, 6),
12203                                false, false, 1);
12204
12205    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
12206  }
12207}
12208
12209SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
12210                                            SelectionDAG &DAG) const {
12211  /*
12212   The rounding mode is in bits 11:10 of the x87 FP control word (FPCW),
12213   and has the following settings:
12214     00 Round to nearest
12215     01 Round to -inf
12216     10 Round to +inf
12217     11 Round to 0
12218
12219  FLT_ROUNDS, on the other hand, expects the following:
12220    -1 Undefined
12221     0 Round to 0
12222     1 Round to nearest
12223     2 Round to +inf
12224     3 Round to -inf
12225
12226  To perform the conversion, we do:
12227    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
12228  */
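  // As a sanity check on the formula above (RC = FPCW bits 11:10):
  //   RC=00 (nearest)      -> (0|0)+1 & 3 = 1
  //   RC=01 (toward -inf)  -> (0|2)+1 & 3 = 3
  //   RC=10 (toward +inf)  -> (1|0)+1 & 3 = 2
  //   RC=11 (toward zero)  -> (1|2)+1 & 3 = 0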
12229
12230  MachineFunction &MF = DAG.getMachineFunction();
12231  const TargetMachine &TM = MF.getTarget();
12232  const TargetFrameLowering &TFI = *TM.getFrameLowering();
12233  unsigned StackAlignment = TFI.getStackAlignment();
12234  EVT VT = Op.getValueType();
12235  SDLoc DL(Op);
12236
12237  // Save FP Control Word to stack slot
12238  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
12239  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
12240
12241  MachineMemOperand *MMO =
12242   MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
12243                           MachineMemOperand::MOStore, 2, 2);
12244
12245  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
12246  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
12247                                          DAG.getVTList(MVT::Other),
12248                                          Ops, array_lengthof(Ops), MVT::i16,
12249                                          MMO);
12250
12251  // Load FP Control Word from stack slot
12252  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
12253                            MachinePointerInfo(), false, false, false, 0);
12254
12255  // Transform as necessary
12256  SDValue CWD1 =
12257    DAG.getNode(ISD::SRL, DL, MVT::i16,
12258                DAG.getNode(ISD::AND, DL, MVT::i16,
12259                            CWD, DAG.getConstant(0x800, MVT::i16)),
12260                DAG.getConstant(11, MVT::i8));
12261  SDValue CWD2 =
12262    DAG.getNode(ISD::SRL, DL, MVT::i16,
12263                DAG.getNode(ISD::AND, DL, MVT::i16,
12264                            CWD, DAG.getConstant(0x400, MVT::i16)),
12265                DAG.getConstant(9, MVT::i8));
12266
12267  SDValue RetVal =
12268    DAG.getNode(ISD::AND, DL, MVT::i16,
12269                DAG.getNode(ISD::ADD, DL, MVT::i16,
12270                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
12271                            DAG.getConstant(1, MVT::i16)),
12272                DAG.getConstant(3, MVT::i16));
12273
12274  return DAG.getNode((VT.getSizeInBits() < 16 ?
12275                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
12276}
12277
12278static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
12279  EVT VT = Op.getValueType();
12280  EVT OpVT = VT;
12281  unsigned NumBits = VT.getSizeInBits();
12282  SDLoc dl(Op);
12283
12284  Op = Op.getOperand(0);
12285  if (VT == MVT::i8) {
12286    // Zero extend to i32 since there is no i8 bsr instruction.
12287    OpVT = MVT::i32;
12288    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
12289  }
12290
12291  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
12292  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
12293  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
12294
12295  // If src is zero (i.e. bsr sets ZF), select 2*NumBits-1; the XOR below turns it into NumBits.
12296  SDValue Ops[] = {
12297    Op,
12298    DAG.getConstant(NumBits+NumBits-1, OpVT),
12299    DAG.getConstant(X86::COND_E, MVT::i8),
12300    Op.getValue(1)
12301  };
12302  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
12303
12304  // Finally xor with NumBits-1.
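  // bsr returns the bit index of the highest set bit; since that index is less
  // than NumBits and NumBits is a power of two, (NumBits-1) - index equals
  // index ^ (NumBits-1), which is the leading-zero count.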
12305  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
12306
12307  if (VT == MVT::i8)
12308    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
12309  return Op;
12310}
12311
12312static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
12313  EVT VT = Op.getValueType();
12314  EVT OpVT = VT;
12315  unsigned NumBits = VT.getSizeInBits();
12316  SDLoc dl(Op);
12317
12318  Op = Op.getOperand(0);
12319  if (VT == MVT::i8) {
12320    // Zero extend to i32 since there is no i8 bsr instruction.
12321    OpVT = MVT::i32;
12322    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
12323  }
12324
12325  // Issue a bsr (scan bits in reverse).
12326  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
12327  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
12328
12329  // And xor with NumBits-1.
12330  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
12331
12332  if (VT == MVT::i8)
12333    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
12334  return Op;
12335}
12336
12337static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
12338  EVT VT = Op.getValueType();
12339  unsigned NumBits = VT.getSizeInBits();
12340  SDLoc dl(Op);
12341  Op = Op.getOperand(0);
12342
12343  // Issue a bsf (scan bits forward) which also sets EFLAGS.
12344  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
12345  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
12346
12347  // If src is zero (i.e. bsf sets ZF), returns NumBits.
12348  SDValue Ops[] = {
12349    Op,
12350    DAG.getConstant(NumBits, VT),
12351    DAG.getConstant(X86::COND_E, MVT::i8),
12352    Op.getValue(1)
12353  };
12354  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
12355}
12356
12357// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
12358// ones, and then concatenate the result back.
12359static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
12360  EVT VT = Op.getValueType();
12361
12362  assert(VT.is256BitVector() && VT.isInteger() &&
12363         "Unsupported value type for operation");
12364
12365  unsigned NumElems = VT.getVectorNumElements();
12366  SDLoc dl(Op);
12367
12368  // Extract the LHS vectors
12369  SDValue LHS = Op.getOperand(0);
12370  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
12371  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
12372
12373  // Extract the RHS vectors
12374  SDValue RHS = Op.getOperand(1);
12375  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
12376  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
12377
12378  MVT EltVT = VT.getVectorElementType().getSimpleVT();
12379  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
12380
12381  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
12382                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
12383                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
12384}
12385
12386static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
12387  assert(Op.getValueType().is256BitVector() &&
12388         Op.getValueType().isInteger() &&
12389         "Only handle AVX 256-bit vector integer operation");
12390  return Lower256IntArith(Op, DAG);
12391}
12392
12393static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
12394  assert(Op.getValueType().is256BitVector() &&
12395         Op.getValueType().isInteger() &&
12396         "Only handle AVX 256-bit vector integer operation");
12397  return Lower256IntArith(Op, DAG);
12398}
12399
12400static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
12401                        SelectionDAG &DAG) {
12402  SDLoc dl(Op);
12403  EVT VT = Op.getValueType();
12404
12405  // Decompose 256-bit ops into smaller 128-bit ops.
12406  if (VT.is256BitVector() && !Subtarget->hasInt256())
12407    return Lower256IntArith(Op, DAG);
12408
12409  SDValue A = Op.getOperand(0);
12410  SDValue B = Op.getOperand(1);
12411
12412  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
12413  if (VT == MVT::v4i32) {
12414    assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
12415           "Should not custom lower when pmuldq is available!");
12416
12417    // Extract the odd parts.
12418    static const int UnpackMask[] = { 1, -1, 3, -1 };
12419    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
12420    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
12421
12422    // Multiply the even parts.
12423    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
12424    // Now multiply odd parts.
12425    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
12426
12427    Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
12428    Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
12429
12430    // Merge the two vectors back together with a shuffle. This expands into 2
12431    // shuffles.
12432    static const int ShufMask[] = { 0, 4, 2, 6 };
12433    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
12434  }
12435
12436  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
12437         "Only know how to lower V2I64/V4I64/V8I64 multiply");
12438
12439  //  Ahi = psrlqi(a, 32);
12440  //  Bhi = psrlqi(b, 32);
12441  //
12442  //  AloBlo = pmuludq(a, b);
12443  //  AloBhi = pmuludq(a, Bhi);
12444  //  AhiBlo = pmuludq(Ahi, b);
12445
12446  //  AloBhi = psllqi(AloBhi, 32);
12447  //  AhiBlo = psllqi(AhiBlo, 32);
12448  //  return AloBlo + AloBhi + AhiBlo;
12449
12450  SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
12451  SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
12452
12453  // Bit cast to 32-bit vectors for MULUDQ
12454  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
12455                                  (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
12456  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
12457  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
12458  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
12459  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
12460
12461  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
12462  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
12463  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
12464
12465  AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
12466  AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
12467
12468  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
12469  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
12470}
12471
12472static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
12473  EVT VT = Op.getValueType();
12474  EVT EltTy = VT.getVectorElementType();
12475  unsigned NumElts = VT.getVectorNumElements();
12476  SDValue N0 = Op.getOperand(0);
12477  SDLoc dl(Op);
12478
12479  // Lower sdiv X, pow2-const.
12480  BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
12481  if (!C)
12482    return SDValue();
12483
12484  APInt SplatValue, SplatUndef;
12485  unsigned SplatBitSize;
12486  bool HasAnyUndefs;
12487  if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
12488                          HasAnyUndefs) ||
12489      EltTy.getSizeInBits() < SplatBitSize)
12490    return SDValue();
12491
12492  if ((SplatValue != 0) &&
12493      (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
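    // Standard signed-division-by-power-of-two sequence: bias negative
    // dividends by (2^Lg2 - 1) so the truncation rounds toward zero, shift
    // arithmetically right by Lg2, and negate the quotient if the divisor was
    // negative.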
12494    unsigned Lg2 = SplatValue.countTrailingZeros();
12495    // Splat the sign bit.
12496    SmallVector<SDValue, 16> Sz(NumElts,
12497                                DAG.getConstant(EltTy.getSizeInBits() - 1,
12498                                                EltTy));
12499    SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0,
12500                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0],
12501                                          NumElts));
12502    // Add (N0 < 0) ? (2^Lg2 - 1) : 0;
12503    SmallVector<SDValue, 16> Amt(NumElts,
12504                                 DAG.getConstant(EltTy.getSizeInBits() - Lg2,
12505                                                 EltTy));
12506    SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN,
12507                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0],
12508                                          NumElts));
12509    SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
12510    SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy));
12511    SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD,
12512                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0],
12513                                          NumElts));
12514
12515    // If we're dividing by a positive value, we're done.  Otherwise, we must
12516    // negate the result.
12517    if (SplatValue.isNonNegative())
12518      return SRA;
12519
12520    SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy));
12521    SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts);
12522    return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA);
12523  }
12524  return SDValue();
12525}
12526
12527static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
12528                                         const X86Subtarget *Subtarget) {
12529  EVT VT = Op.getValueType();
12530  SDLoc dl(Op);
12531  SDValue R = Op.getOperand(0);
12532  SDValue Amt = Op.getOperand(1);
12533
12534  // Optimize shl/srl/sra with constant shift amount.
12535  if (isSplatVector(Amt.getNode())) {
12536    SDValue SclrAmt = Amt->getOperand(0);
12537    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
12538      uint64_t ShiftAmt = C->getZExtValue();
12539
12540      if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
12541          (Subtarget->hasInt256() &&
12542           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
12543          (Subtarget->hasAVX512() &&
12544           (VT == MVT::v8i64 || VT == MVT::v16i32))) {
12545        if (Op.getOpcode() == ISD::SHL)
12546          return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
12547                                            DAG);
12548        if (Op.getOpcode() == ISD::SRL)
12549          return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
12550                                            DAG);
12551        if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
12552          return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
12553                                            DAG);
12554      }
12555
12556      if (VT == MVT::v16i8) {
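        // There is no byte-granular shift instruction, so shift as v8i16 and
        // mask away the bits that cross over from the neighbouring byte of
        // each 16-bit lane.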
12557        if (Op.getOpcode() == ISD::SHL) {
12558          // Make a large shift.
12559          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
12560                                                   MVT::v8i16, R, ShiftAmt,
12561                                                   DAG);
12562          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
12563          // Zero out the rightmost bits.
12564          SmallVector<SDValue, 16> V(16,
12565                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
12566                                                     MVT::i8));
12567          return DAG.getNode(ISD::AND, dl, VT, SHL,
12568                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
12569        }
12570        if (Op.getOpcode() == ISD::SRL) {
12571          // Make a large shift.
12572          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
12573                                                   MVT::v8i16, R, ShiftAmt,
12574                                                   DAG);
12575          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
12576          // Zero out the leftmost bits.
12577          SmallVector<SDValue, 16> V(16,
12578                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
12579                                                     MVT::i8));
12580          return DAG.getNode(ISD::AND, dl, VT, SRL,
12581                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
12582        }
12583        if (Op.getOpcode() == ISD::SRA) {
12584          if (ShiftAmt == 7) {
12585            // R s>> 7  ===  R s< 0
12586            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
12587            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
12588          }
12589
12590          // R s>> a === ((R u>> a) ^ m) - m
12591          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
12592          SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
12593                                                         MVT::i8));
12594          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
12595          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
12596          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
12597          return Res;
12598        }
12599        llvm_unreachable("Unknown shift opcode.");
12600      }
12601
12602      if (Subtarget->hasInt256() && VT == MVT::v32i8) {
12603        if (Op.getOpcode() == ISD::SHL) {
12604          // Make a large shift.
12605          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
12606                                                   MVT::v16i16, R, ShiftAmt,
12607                                                   DAG);
12608          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
12609          // Zero out the rightmost bits.
12610          SmallVector<SDValue, 32> V(32,
12611                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
12612                                                     MVT::i8));
12613          return DAG.getNode(ISD::AND, dl, VT, SHL,
12614                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
12615        }
12616        if (Op.getOpcode() == ISD::SRL) {
12617          // Make a large shift.
12618          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
12619                                                   MVT::v16i16, R, ShiftAmt,
12620                                                   DAG);
12621          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
12622          // Zero out the leftmost bits.
12623          SmallVector<SDValue, 32> V(32,
12624                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
12625                                                     MVT::i8));
12626          return DAG.getNode(ISD::AND, dl, VT, SRL,
12627                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
12628        }
12629        if (Op.getOpcode() == ISD::SRA) {
12630          if (ShiftAmt == 7) {
12631            // R s>> 7  ===  R s< 0
12632            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
12633            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
12634          }
12635
12636          // R s>> a === ((R u>> a) ^ m) - m
12637          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
12638          SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
12639                                                         MVT::i8));
12640          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
12641          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
12642          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
12643          return Res;
12644        }
12645        llvm_unreachable("Unknown shift opcode.");
12646      }
12647    }
12648  }
12649
12650  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
12651  if (!Subtarget->is64Bit() &&
12652      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
12653      Amt.getOpcode() == ISD::BITCAST &&
12654      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
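    // The 64-bit shift amounts are only visible here as a bitcast of a
    // BUILD_VECTOR of narrower constants, so reassemble each 64-bit amount
    // from its Ratio pieces and require every element to use the same amount.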
12655    Amt = Amt.getOperand(0);
12656    unsigned Ratio = Amt.getValueType().getVectorNumElements() /
12657                     VT.getVectorNumElements();
12658    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
12659    uint64_t ShiftAmt = 0;
12660    for (unsigned i = 0; i != Ratio; ++i) {
12661      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
12662      if (C == 0)
12663        return SDValue();
12664      // 6 == Log2(64)
12665      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
12666    }
12667    // Check remaining shift amounts.
12668    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
12669      uint64_t ShAmt = 0;
12670      for (unsigned j = 0; j != Ratio; ++j) {
12671        ConstantSDNode *C =
12672          dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
12673        if (C == 0)
12674          return SDValue();
12675        // 6 == Log2(64)
12676        ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
12677      }
12678      if (ShAmt != ShiftAmt)
12679        return SDValue();
12680    }
12681    switch (Op.getOpcode()) {
12682    default:
12683      llvm_unreachable("Unknown shift opcode!");
12684    case ISD::SHL:
12685      return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
12686                                        DAG);
12687    case ISD::SRL:
12688      return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
12689                                        DAG);
12690    case ISD::SRA:
12691      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
12692                                        DAG);
12693    }
12694  }
12695
12696  return SDValue();
12697}
12698
12699static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
12700                                        const X86Subtarget* Subtarget) {
12701  EVT VT = Op.getValueType();
12702  SDLoc dl(Op);
12703  SDValue R = Op.getOperand(0);
12704  SDValue Amt = Op.getOperand(1);
12705
12706  if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
12707      VT == MVT::v4i32 || VT == MVT::v8i16 ||
12708      (Subtarget->hasInt256() &&
12709       ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
12710        VT == MVT::v8i32 || VT == MVT::v16i16)) ||
12711       (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
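    // These types have a shift-by-scalar form, so try to prove that the
    // per-element shift amounts are a splat and extract that single scalar
    // amount.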
12712    SDValue BaseShAmt;
12713    EVT EltVT = VT.getVectorElementType();
12714
12715    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
12716      unsigned NumElts = VT.getVectorNumElements();
12717      unsigned i, j;
12718      for (i = 0; i != NumElts; ++i) {
12719        if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
12720          continue;
12721        break;
12722      }
12723      for (j = i; j != NumElts; ++j) {
12724        SDValue Arg = Amt.getOperand(j);
12725        if (Arg.getOpcode() == ISD::UNDEF) continue;
12726        if (Arg != Amt.getOperand(i))
12727          break;
12728      }
12729      if (i != NumElts && j == NumElts)
12730        BaseShAmt = Amt.getOperand(i);
12731    } else {
12732      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12733        Amt = Amt.getOperand(0);
12734      if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
12735               cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
12736        SDValue InVec = Amt.getOperand(0);
12737        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
12738          unsigned NumElts = InVec.getValueType().getVectorNumElements();
12739          unsigned i = 0;
12740          for (; i != NumElts; ++i) {
12741            SDValue Arg = InVec.getOperand(i);
12742            if (Arg.getOpcode() == ISD::UNDEF) continue;
12743            BaseShAmt = Arg;
12744            break;
12745          }
12746        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
12747           if (ConstantSDNode *C =
12748               dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
12749             unsigned SplatIdx =
12750               cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
12751             if (C->getZExtValue() == SplatIdx)
12752               BaseShAmt = InVec.getOperand(1);
12753           }
12754        }
12755        if (BaseShAmt.getNode() == 0)
12756          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
12757                                  DAG.getIntPtrConstant(0));
12758      }
12759    }
12760
12761    if (BaseShAmt.getNode()) {
12762      if (EltVT.bitsGT(MVT::i32))
12763        BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
12764      else if (EltVT.bitsLT(MVT::i32))
12765        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
12766
12767      switch (Op.getOpcode()) {
12768      default:
12769        llvm_unreachable("Unknown shift opcode!");
12770      case ISD::SHL:
12771        switch (VT.getSimpleVT().SimpleTy) {
12772        default: return SDValue();
12773        case MVT::v2i64:
12774        case MVT::v4i32:
12775        case MVT::v8i16:
12776        case MVT::v4i64:
12777        case MVT::v8i32:
12778        case MVT::v16i16:
12779        case MVT::v16i32:
12780        case MVT::v8i64:
12781          return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
12782        }
12783      case ISD::SRA:
12784        switch (VT.getSimpleVT().SimpleTy) {
12785        default: return SDValue();
12786        case MVT::v4i32:
12787        case MVT::v8i16:
12788        case MVT::v8i32:
12789        case MVT::v16i16:
12790        case MVT::v16i32:
12791        case MVT::v8i64:
12792          return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
12793        }
12794      case ISD::SRL:
12795        switch (VT.getSimpleVT().SimpleTy) {
12796        default: return SDValue();
12797        case MVT::v2i64:
12798        case MVT::v4i32:
12799        case MVT::v8i16:
12800        case MVT::v4i64:
12801        case MVT::v8i32:
12802        case MVT::v16i16:
12803        case MVT::v16i32:
12804        case MVT::v8i64:
12805          return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
12806        }
12807      }
12808    }
12809  }
12810
12811  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
12812  if (!Subtarget->is64Bit() &&
12813      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
12814      (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
12815      Amt.getOpcode() == ISD::BITCAST &&
12816      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
12817    Amt = Amt.getOperand(0);
12818    unsigned Ratio = Amt.getValueType().getVectorNumElements() /
12819                     VT.getVectorNumElements();
12820    std::vector<SDValue> Vals(Ratio);
12821    for (unsigned i = 0; i != Ratio; ++i)
12822      Vals[i] = Amt.getOperand(i);
12823    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
12824      for (unsigned j = 0; j != Ratio; ++j)
12825        if (Vals[j] != Amt.getOperand(i + j))
12826          return SDValue();
12827    }
12828    switch (Op.getOpcode()) {
12829    default:
12830      llvm_unreachable("Unknown shift opcode!");
12831    case ISD::SHL:
12832      return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
12833    case ISD::SRL:
12834      return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
12835    case ISD::SRA:
12836      return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
12837    }
12838  }
12839
12840  return SDValue();
12841}
12842
12843static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
12844                          SelectionDAG &DAG) {
12845
12846  EVT VT = Op.getValueType();
12847  SDLoc dl(Op);
12848  SDValue R = Op.getOperand(0);
12849  SDValue Amt = Op.getOperand(1);
12850  SDValue V;
12851
12852  if (!Subtarget->hasSSE2())
12853    return SDValue();
12854
12855  V = LowerScalarImmediateShift(Op, DAG, Subtarget);
12856  if (V.getNode())
12857    return V;
12858
12859  V = LowerScalarVariableShift(Op, DAG, Subtarget);
12860  if (V.getNode())
12861      return V;
12862
12863  if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
12864    return Op;
12865  // AVX2 has VPSLLV/VPSRAV/VPSRLV.
12866  if (Subtarget->hasInt256()) {
12867    if (Op.getOpcode() == ISD::SRL &&
12868        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
12869         VT == MVT::v4i64 || VT == MVT::v8i32))
12870      return Op;
12871    if (Op.getOpcode() == ISD::SHL &&
12872        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
12873         VT == MVT::v4i64 || VT == MVT::v8i32))
12874      return Op;
12875    if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
12876      return Op;
12877  }
12878
12879  // Lower SHL with variable shift amount.
12880  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
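    // Build 2^Amt per lane without a variable shift: move Amt into the
    // exponent field of an IEEE single (shift left by 23 and add the bias
    // 0x3f800000 == 1.0f), convert back to integer, and multiply, since
    // x << a == x * 2^a.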
12881    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
12882
12883    Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
12884    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
12885    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
12886    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
12887  }
12888  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
12889    assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
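    // Process the shift amount one bit at a time: a << 5 moves the amount's
    // bit 2 into each byte's sign bit, PCMPEQ against 0x80 turns that bit into
    // a lane mask, and VSELECT conditionally applies a shift by 4, then (after
    // a += a) by 2, and finally by 1.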
12890
12891    // a = a << 5;
12892    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
12893    Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
12894
12895    // Turn 'a' into a mask suitable for VSELECT
12896    SDValue VSelM = DAG.getConstant(0x80, VT);
12897    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
12898    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
12899
12900    SDValue CM1 = DAG.getConstant(0x0f, VT);
12901    SDValue CM2 = DAG.getConstant(0x3f, VT);
12902
12903    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
12904    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
12905    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
12906    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
12907    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
12908
12909    // a += a
12910    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
12911    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
12912    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
12913
12914    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
12915    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
12916    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
12917    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
12918    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
12919
12920    // a += a
12921    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
12922    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
12923    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
12924
12925    // return VSELECT(r, r+r, a);
12926    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
12927                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
12928    return R;
12929  }
12930
12931  // Decompose 256-bit shifts into smaller 128-bit shifts.
12932  if (VT.is256BitVector()) {
12933    unsigned NumElems = VT.getVectorNumElements();
12934    MVT EltVT = VT.getVectorElementType().getSimpleVT();
12935    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
12936
12937    // Extract the two vectors
12938    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
12939    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
12940
12941    // Recreate the shift amount vectors
12942    SDValue Amt1, Amt2;
12943    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
12944      // Constant shift amount
12945      SmallVector<SDValue, 4> Amt1Csts;
12946      SmallVector<SDValue, 4> Amt2Csts;
12947      for (unsigned i = 0; i != NumElems/2; ++i)
12948        Amt1Csts.push_back(Amt->getOperand(i));
12949      for (unsigned i = NumElems/2; i != NumElems; ++i)
12950        Amt2Csts.push_back(Amt->getOperand(i));
12951
12952      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
12953                                 &Amt1Csts[0], NumElems/2);
12954      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
12955                                 &Amt2Csts[0], NumElems/2);
12956    } else {
12957      // Variable shift amount
12958      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
12959      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
12960    }
12961
12962    // Issue new vector shifts for the smaller types
12963    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
12964    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
12965
12966    // Concatenate the result back
12967    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
12968  }
12969
12970  return SDValue();
12971}
12972
12973static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
12974  // Lower the "add/sub/mul with overflow" instruction into a regular
12975  // instruction plus a "setcc" instruction that checks the overflow flag. The
12976  // "brcond" lowering looks for this combo and may remove the "setcc"
12977  // instruction if the "setcc" has only one use.
12978  SDNode *N = Op.getNode();
12979  SDValue LHS = N->getOperand(0);
12980  SDValue RHS = N->getOperand(1);
12981  unsigned BaseOp = 0;
12982  unsigned Cond = 0;
12983  SDLoc DL(Op);
12984  switch (Op.getOpcode()) {
12985  default: llvm_unreachable("Unknown ovf instruction!");
12986  case ISD::SADDO:
12987    // An add of one will be selected as an INC. Note that INC doesn't
12988    // set CF, so we can't do this for UADDO.
12989    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
12990      if (C->isOne()) {
12991        BaseOp = X86ISD::INC;
12992        Cond = X86::COND_O;
12993        break;
12994      }
12995    BaseOp = X86ISD::ADD;
12996    Cond = X86::COND_O;
12997    break;
12998  case ISD::UADDO:
12999    BaseOp = X86ISD::ADD;
13000    Cond = X86::COND_B;
13001    break;
13002  case ISD::SSUBO:
13003    // A subtract of one will be selected as a DEC. Note that DEC doesn't
13004    // set CF, so we can't do this for USUBO.
13005    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
13006      if (C->isOne()) {
13007        BaseOp = X86ISD::DEC;
13008        Cond = X86::COND_O;
13009        break;
13010      }
13011    BaseOp = X86ISD::SUB;
13012    Cond = X86::COND_O;
13013    break;
13014  case ISD::USUBO:
13015    BaseOp = X86ISD::SUB;
13016    Cond = X86::COND_B;
13017    break;
13018  case ISD::SMULO:
13019    BaseOp = X86ISD::SMUL;
13020    Cond = X86::COND_O;
13021    break;
13022  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
13023    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
13024                                 MVT::i32);
13025    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
13026
13027    SDValue SetCC =
13028      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
13029                  DAG.getConstant(X86::COND_O, MVT::i32),
13030                  SDValue(Sum.getNode(), 2));
13031
13032    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
13033  }
13034  }
13035
13036  // Also sets EFLAGS.
13037  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
13038  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
13039
13040  SDValue SetCC =
13041    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
13042                DAG.getConstant(Cond, MVT::i32),
13043                SDValue(Sum.getNode(), 1));
13044
13045  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
13046}
13047
13048SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
13049                                                  SelectionDAG &DAG) const {
13050  SDLoc dl(Op);
13051  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
13052  EVT VT = Op.getValueType();
13053
13054  if (!Subtarget->hasSSE2() || !VT.isVector())
13055    return SDValue();
13056
13057  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
13058                      ExtraVT.getScalarType().getSizeInBits();
13059
13060  switch (VT.getSimpleVT().SimpleTy) {
13061    default: return SDValue();
13062    case MVT::v8i32:
13063    case MVT::v16i16:
13064      if (!Subtarget->hasFp256())
13065        return SDValue();
13066      if (!Subtarget->hasInt256()) {
13067        // needs to be split
13068        unsigned NumElems = VT.getVectorNumElements();
13069
13070        // Extract the LHS vectors
13071        SDValue LHS = Op.getOperand(0);
13072        SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
13073        SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
13074
13075        MVT EltVT = VT.getVectorElementType().getSimpleVT();
13076        EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
13077
13078        EVT ExtraEltVT = ExtraVT.getVectorElementType();
13079        unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
13080        ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
13081                                   ExtraNumElems/2);
13082        SDValue Extra = DAG.getValueType(ExtraVT);
13083
13084        LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
13085        LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
13086
13087        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
13088      }
13089      // fall through
13090    case MVT::v4i32:
13091    case MVT::v8i16: {
13092      // (sext (vzext x)) -> (vsext x)
13093      SDValue Op0 = Op.getOperand(0);
13094      SDValue Op00 = Op0.getOperand(0);
13095      SDValue Tmp1;
13096      // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
13097      if (Op0.getOpcode() == ISD::BITCAST &&
13098          Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
13099        Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
13100      if (Tmp1.getNode()) {
13101        SDValue Tmp1Op0 = Tmp1.getOperand(0);
13102        assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
13103               "This optimization is invalid without a VZEXT.");
13104        return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
13105      }
13106
13107      // If the above didn't work, then just use Shift-Left + Shift-Right.
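           // Shifting left by BitsDiff moves the sign bit of the narrow type
           // into the element's MSB; the arithmetic shift right then copies
           // that bit back down, filling the upper bits with the sign.  E.g.
           // sign-extending the low 8 bits of a 16-bit lane uses BitsDiff = 8:
           // (x << 8) >>s 8.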
13108      Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,
13109                                        DAG);
13110      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,
13111                                        DAG);
13112    }
13113  }
13114}
13115
13116static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
13117                                 SelectionDAG &DAG) {
13118  SDLoc dl(Op);
13119  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
13120    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
13121  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
13122    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
13123
13124  // The only fence that needs an instruction is a sequentially-consistent
13125  // cross-thread fence.
13126  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
13127    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
13128    // no-sse2). There isn't any reason to disable it if the target processor
13129    // supports it.
13130    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
13131      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
13132
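         // Otherwise fall back to a LOCK'ed OR of zero into the word at the
         // top of the stack.  Any LOCK-prefixed read-modify-write instruction
         // is a full memory barrier on x86, so this substitutes for MFENCE on
         // pre-SSE2 32-bit targets without changing the stack contents.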
13133    SDValue Chain = Op.getOperand(0);
13134    SDValue Zero = DAG.getConstant(0, MVT::i32);
13135    SDValue Ops[] = {
13136      DAG.getRegister(X86::ESP, MVT::i32), // Base
13137      DAG.getTargetConstant(1, MVT::i8),   // Scale
13138      DAG.getRegister(0, MVT::i32),        // Index
13139      DAG.getTargetConstant(0, MVT::i32),  // Disp
13140      DAG.getRegister(0, MVT::i32),        // Segment.
13141      Zero,
13142      Chain
13143    };
13144    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
13145    return SDValue(Res, 0);
13146  }
13147
13148  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
13149  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
13150}
13151
13152static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
13153                             SelectionDAG &DAG) {
13154  EVT T = Op.getValueType();
13155  SDLoc DL(Op);
13156  unsigned Reg = 0;
13157  unsigned size = 0;
13158  switch(T.getSimpleVT().SimpleTy) {
13159  default: llvm_unreachable("Invalid value type!");
13160  case MVT::i8:  Reg = X86::AL;  size = 1; break;
13161  case MVT::i16: Reg = X86::AX;  size = 2; break;
13162  case MVT::i32: Reg = X86::EAX; size = 4; break;
13163  case MVT::i64:
13164    assert(Subtarget->is64Bit() && "Node not type legal!");
13165    Reg = X86::RAX; size = 8;
13166    break;
13167  }
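       // CMPXCHG compares the accumulator (AL/AX/EAX/RAX) with the memory
       // operand and, if they match, stores the new value; either way the old
       // memory value is left in the accumulator.  So the expected value goes
       // into the accumulator first and the result is copied back out of it.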
13168  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
13169                                    Op.getOperand(2), SDValue());
13170  SDValue Ops[] = { cpIn.getValue(0),
13171                    Op.getOperand(1),
13172                    Op.getOperand(3),
13173                    DAG.getTargetConstant(size, MVT::i8),
13174                    cpIn.getValue(1) };
13175  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
13176  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
13177  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
13178                                           Ops, array_lengthof(Ops), T, MMO);
13179  SDValue cpOut =
13180    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
13181  return cpOut;
13182}
13183
13184static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
13185                                     SelectionDAG &DAG) {
13186  assert(Subtarget->is64Bit() && "Result not type legalized?");
13187  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
13188  SDValue TheChain = Op.getOperand(0);
13189  SDLoc dl(Op);
13190  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
13191  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
13192  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
13193                                   rax.getValue(2));
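       // RDTSC returns the low 32 bits of the timestamp in RAX and the high
       // 32 bits in RDX (upper halves zeroed), so the full 64-bit counter is
       // (RDX << 32) | RAX.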
13194  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
13195                            DAG.getConstant(32, MVT::i8));
13196  SDValue Ops[] = {
13197    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
13198    rdx.getValue(1)
13199  };
13200  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
13201}
13202
13203static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
13204                            SelectionDAG &DAG) {
13205  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13206  MVT DstVT = Op.getSimpleValueType();
13207  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
13208         Subtarget->hasMMX() && "Unexpected custom BITCAST");
13209  assert((DstVT == MVT::i64 ||
13210          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
13211         "Unexpected custom BITCAST");
13212  // i64 <=> MMX conversions are Legal.
13213  if (SrcVT==MVT::i64 && DstVT.isVector())
13214    return Op;
13215  if (DstVT==MVT::i64 && SrcVT.isVector())
13216    return Op;
13217  // MMX <=> MMX conversions are Legal.
13218  if (SrcVT.isVector() && DstVT.isVector())
13219    return Op;
13220  // All other conversions need to be expanded.
13221  return SDValue();
13222}
13223
13224static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
13225  SDNode *Node = Op.getNode();
13226  SDLoc dl(Node);
13227  EVT T = Node->getValueType(0);
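       // x86 has XADD for fetch-and-add but no fetch-and-sub that returns the
       // old value, so lower fetch-sub as fetch-add of the negated operand and
       // reuse the ATOMIC_LOAD_ADD lowering.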
13228  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
13229                              DAG.getConstant(0, T), Node->getOperand(2));
13230  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
13231                       cast<AtomicSDNode>(Node)->getMemoryVT(),
13232                       Node->getOperand(0),
13233                       Node->getOperand(1), negOp,
13234                       cast<AtomicSDNode>(Node)->getSrcValue(),
13235                       cast<AtomicSDNode>(Node)->getAlignment(),
13236                       cast<AtomicSDNode>(Node)->getOrdering(),
13237                       cast<AtomicSDNode>(Node)->getSynchScope());
13238}
13239
13240static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
13241  SDNode *Node = Op.getNode();
13242  SDLoc dl(Node);
13243  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
13244
13245  // Convert seq_cst store -> xchg
13246  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
13247  // FIXME: On 32-bit, store -> fist or movq would be more efficient
13248  //        (The only way to get a 16-byte store is cmpxchg16b)
13249  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
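       // A plain MOV store is not a sequentially-consistent barrier: x86
       // allows a later load to be reordered ahead of it.  XCHG with a memory
       // operand is implicitly LOCK'ed, so it provides the required ordering.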
13250  if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
13251      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
13252    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
13253                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
13254                                 Node->getOperand(0),
13255                                 Node->getOperand(1), Node->getOperand(2),
13256                                 cast<AtomicSDNode>(Node)->getMemOperand(),
13257                                 cast<AtomicSDNode>(Node)->getOrdering(),
13258                                 cast<AtomicSDNode>(Node)->getSynchScope());
13259    return Swap.getValue(1);
13260  }
13261  // Other atomic stores have a simple pattern.
13262  return Op;
13263}
13264
13265static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
13266  EVT VT = Op.getNode()->getValueType(0);
13267
13268  // Let legalize expand this if it isn't a legal type yet.
13269  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
13270    return SDValue();
13271
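       // Lower to the corresponding two-result X86ISD node; the second (i32)
       // result is EFLAGS, and ADDE/SUBE additionally consume the incoming
       // carry as a third operand.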
13272  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
13273
13274  unsigned Opc;
13275  bool ExtraOp = false;
13276  switch (Op.getOpcode()) {
13277  default: llvm_unreachable("Invalid code");
13278  case ISD::ADDC: Opc = X86ISD::ADD; break;
13279  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
13280  case ISD::SUBC: Opc = X86ISD::SUB; break;
13281  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
13282  }
13283
13284  if (!ExtraOp)
13285    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
13286                       Op.getOperand(1));
13287  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
13288                     Op.getOperand(1), Op.getOperand(2));
13289}
13290
13291static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
13292                            SelectionDAG &DAG) {
13293  assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
13294
13295  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
13296  // which returns the values as { float, float } (in XMM0) or
13297  // { double, double } (returned in XMM0 and XMM1).
13298  SDLoc dl(Op);
13299  SDValue Arg = Op.getOperand(0);
13300  EVT ArgVT = Arg.getValueType();
13301  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
13302
13303  TargetLowering::ArgListTy Args;
13304  TargetLowering::ArgListEntry Entry;
13305
13306  Entry.Node = Arg;
13307  Entry.Ty = ArgTy;
13308  Entry.isSExt = false;
13309  Entry.isZExt = false;
13310  Args.push_back(Entry);
13311
13312  bool isF64 = ArgVT == MVT::f64;
13313  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
13314  // the small struct {f32, f32} is returned in (eax, edx). For f64,
13315  // the results are returned via SRet in memory.
13316  const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
13317  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13318  SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
13319
13320  Type *RetTy = isF64
13321    ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
13322    : (Type*)VectorType::get(ArgTy, 4);
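       // Modeling the f32 return as <4 x float> makes the whole of XMM0 the
       // call result, so sin and cos can be pulled out of lanes 0 and 1 below.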
13323  TargetLowering::
13324    CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
13325                         false, false, false, false, 0,
13326                         CallingConv::C, /*isTailCall=*/false,
13327                         /*doesNotRet=*/false, /*isReturnValueUsed*/true,
13328                         Callee, Args, DAG, dl);
13329  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
13330
13331  if (isF64)
13332    // Returned in xmm0 and xmm1.
13333    return CallResult.first;
13334
13335  // Returned in bits 0:31 and 32:63 of xmm0.
13336  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
13337                               CallResult.first, DAG.getIntPtrConstant(0));
13338  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
13339                               CallResult.first, DAG.getIntPtrConstant(1));
13340  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
13341  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
13342}
13343
13344/// LowerOperation - Provide custom lowering hooks for some operations.
13345///
13346SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
13347  switch (Op.getOpcode()) {
13348  default: llvm_unreachable("Should not custom lower this!");
13349  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
13350  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
13351  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op, Subtarget, DAG);
13352  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
13353  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
13354  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
13355  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
13356  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
13357  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
13358  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
13359  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
13360  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
13361  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
13362  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
13363  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
13364  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
13365  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
13366  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
13367  case ISD::SHL_PARTS:
13368  case ISD::SRA_PARTS:
13369  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
13370  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
13371  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
13372  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
13373  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
13374  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
13375  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
13376  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
13377  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
13378  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
13379  case ISD::FABS:               return LowerFABS(Op, DAG);
13380  case ISD::FNEG:               return LowerFNEG(Op, DAG);
13381  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
13382  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
13383  case ISD::SETCC:              return LowerSETCC(Op, DAG);
13384  case ISD::SELECT:             return LowerSELECT(Op, DAG);
13385  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
13386  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
13387  case ISD::VASTART:            return LowerVASTART(Op, DAG);
13388  case ISD::VAARG:              return LowerVAARG(Op, DAG);
13389  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
13390  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
13391  case ISD::INTRINSIC_VOID:
13392  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
13393  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
13394  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
13395  case ISD::FRAME_TO_ARGS_OFFSET:
13396                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
13397  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
13398  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
13399  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
13400  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
13401  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
13402  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
13403  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
13404  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
13405  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
13406  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
13407  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
13408  case ISD::SRA:
13409  case ISD::SRL:
13410  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
13411  case ISD::SADDO:
13412  case ISD::UADDO:
13413  case ISD::SSUBO:
13414  case ISD::USUBO:
13415  case ISD::SMULO:
13416  case ISD::UMULO:              return LowerXALUO(Op, DAG);
13417  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
13418  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
13419  case ISD::ADDC:
13420  case ISD::ADDE:
13421  case ISD::SUBC:
13422  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
13423  case ISD::ADD:                return LowerADD(Op, DAG);
13424  case ISD::SUB:                return LowerSUB(Op, DAG);
13425  case ISD::SDIV:               return LowerSDIV(Op, DAG);
13426  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
13427  }
13428}
13429
13430static void ReplaceATOMIC_LOAD(SDNode *Node,
13431                                  SmallVectorImpl<SDValue> &Results,
13432                                  SelectionDAG &DAG) {
13433  SDLoc dl(Node);
13434  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
13435
13436  // Convert wide load -> cmpxchg8b/cmpxchg16b
13437  // FIXME: On 32-bit, load -> fild or movq would be more efficient
13438  //        (The only way to get a 16-byte load is cmpxchg16b)
13439  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
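       // A CMPXCHG with expected == new == 0 never modifies memory (a zero is
       // overwritten with zero, anything else is left alone) but always
       // returns the current value, which is exactly an atomic load.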
13440  SDValue Zero = DAG.getConstant(0, VT);
13441  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
13442                               Node->getOperand(0),
13443                               Node->getOperand(1), Zero, Zero,
13444                               cast<AtomicSDNode>(Node)->getMemOperand(),
13445                               cast<AtomicSDNode>(Node)->getOrdering(),
13446                               cast<AtomicSDNode>(Node)->getSynchScope());
13447  Results.push_back(Swap.getValue(0));
13448  Results.push_back(Swap.getValue(1));
13449}
13450
13451static void
13452ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
13453                        SelectionDAG &DAG, unsigned NewOp) {
13454  SDLoc dl(Node);
13455  assert(Node->getValueType(0) == MVT::i64 &&
13456         "Only know how to expand i64 atomics");
13457
13458  SDValue Chain = Node->getOperand(0);
13459  SDValue In1 = Node->getOperand(1);
13460  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
13461                             Node->getOperand(2), DAG.getIntPtrConstant(0));
13462  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
13463                             Node->getOperand(2), DAG.getIntPtrConstant(1));
13464  SDValue Ops[] = { Chain, In1, In2L, In2H };
13465  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
13466  SDValue Result =
13467    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
13468                            cast<MemSDNode>(Node)->getMemOperand());
13469  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
13470  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
13471  Results.push_back(Result.getValue(2));
13472}
13473
13474/// ReplaceNodeResults - Replace a node with an illegal result type
13475/// with a new node built out of custom code.
13476void X86TargetLowering::ReplaceNodeResults(SDNode *N,
13477                                           SmallVectorImpl<SDValue>&Results,
13478                                           SelectionDAG &DAG) const {
13479  SDLoc dl(N);
13480  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13481  switch (N->getOpcode()) {
13482  default:
13483    llvm_unreachable("Do not know how to custom type legalize this operation!");
13484  case ISD::SIGN_EXTEND_INREG:
13485  case ISD::ADDC:
13486  case ISD::ADDE:
13487  case ISD::SUBC:
13488  case ISD::SUBE:
13489    // We don't want to expand or promote these.
13490    return;
13491  case ISD::FP_TO_SINT:
13492  case ISD::FP_TO_UINT: {
13493    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
13494
13495    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
13496      return;
13497
13498    std::pair<SDValue,SDValue> Vals =
13499        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
13500    SDValue FIST = Vals.first, StackSlot = Vals.second;
13501    if (FIST.getNode() != 0) {
13502      EVT VT = N->getValueType(0);
13503      // Return a load from the stack slot.
13504      if (StackSlot.getNode() != 0)
13505        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
13506                                      MachinePointerInfo(),
13507                                      false, false, false, 0));
13508      else
13509        Results.push_back(FIST);
13510    }
13511    return;
13512  }
13513  case ISD::UINT_TO_FP: {
13514    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
13515    if (N->getOperand(0).getValueType() != MVT::v2i32 ||
13516        N->getValueType(0) != MVT::v2f32)
13517      return;
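         // Use the standard double-precision bias trick: 0x4330000000000000 is
         // the bit pattern of the double 2^52, so OR'ing a zero-extended
         // 32-bit value into its mantissa produces exactly 2^52 + x, and
         // subtracting the bias recovers x as a double.  The v2f64 result is
         // then rounded to v2f32 with VFPROUND.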
13518    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
13519                                 N->getOperand(0));
13520    SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
13521                                     MVT::f64);
13522    SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
13523    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
13524                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
13525    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
13526    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
13527    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
13528    return;
13529  }
13530  case ISD::FP_ROUND: {
13531    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
13532      return;
13533    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
13534    Results.push_back(V);
13535    return;
13536  }
13537  case ISD::READCYCLECOUNTER: {
13538    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
13539    SDValue TheChain = N->getOperand(0);
13540    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
13541    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
13542                                     rd.getValue(1));
13543    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
13544                                     eax.getValue(2));
13545    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
13546    SDValue Ops[] = { eax, edx };
13547    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops,
13548                                  array_lengthof(Ops)));
13549    Results.push_back(edx.getValue(1));
13550    return;
13551  }
13552  case ISD::ATOMIC_CMP_SWAP: {
13553    EVT T = N->getValueType(0);
13554    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
13555    bool Regs64bit = T == MVT::i128;
13556    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
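         // CMPXCHG8B/CMPXCHG16B compare EDX:EAX (RDX:RAX) with the memory
         // operand and, if equal, store ECX:EBX (RCX:RBX) into it; the old
         // value is always returned in EDX:EAX (RDX:RAX).  Split the expected
         // and new values into halves and route them through those registers.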
13557    SDValue cpInL, cpInH;
13558    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
13559                        DAG.getConstant(0, HalfT));
13560    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
13561                        DAG.getConstant(1, HalfT));
13562    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
13563                             Regs64bit ? X86::RAX : X86::EAX,
13564                             cpInL, SDValue());
13565    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
13566                             Regs64bit ? X86::RDX : X86::EDX,
13567                             cpInH, cpInL.getValue(1));
13568    SDValue swapInL, swapInH;
13569    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
13570                          DAG.getConstant(0, HalfT));
13571    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
13572                          DAG.getConstant(1, HalfT));
13573    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
13574                               Regs64bit ? X86::RBX : X86::EBX,
13575                               swapInL, cpInH.getValue(1));
13576    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
13577                               Regs64bit ? X86::RCX : X86::ECX,
13578                               swapInH, swapInL.getValue(1));
13579    SDValue Ops[] = { swapInH.getValue(0),
13580                      N->getOperand(1),
13581                      swapInH.getValue(1) };
13582    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
13583    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
13584    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
13585                                  X86ISD::LCMPXCHG8_DAG;
13586    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
13587                                             Ops, array_lengthof(Ops), T, MMO);
13588    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
13589                                        Regs64bit ? X86::RAX : X86::EAX,
13590                                        HalfT, Result.getValue(1));
13591    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
13592                                        Regs64bit ? X86::RDX : X86::EDX,
13593                                        HalfT, cpOutL.getValue(2));
13594    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
13595    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
13596    Results.push_back(cpOutH.getValue(1));
13597    return;
13598  }
13599  case ISD::ATOMIC_LOAD_ADD:
13600  case ISD::ATOMIC_LOAD_AND:
13601  case ISD::ATOMIC_LOAD_NAND:
13602  case ISD::ATOMIC_LOAD_OR:
13603  case ISD::ATOMIC_LOAD_SUB:
13604  case ISD::ATOMIC_LOAD_XOR:
13605  case ISD::ATOMIC_LOAD_MAX:
13606  case ISD::ATOMIC_LOAD_MIN:
13607  case ISD::ATOMIC_LOAD_UMAX:
13608  case ISD::ATOMIC_LOAD_UMIN:
13609  case ISD::ATOMIC_SWAP: {
13610    unsigned Opc;
13611    switch (N->getOpcode()) {
13612    default: llvm_unreachable("Unexpected opcode");
13613    case ISD::ATOMIC_LOAD_ADD:
13614      Opc = X86ISD::ATOMADD64_DAG;
13615      break;
13616    case ISD::ATOMIC_LOAD_AND:
13617      Opc = X86ISD::ATOMAND64_DAG;
13618      break;
13619    case ISD::ATOMIC_LOAD_NAND:
13620      Opc = X86ISD::ATOMNAND64_DAG;
13621      break;
13622    case ISD::ATOMIC_LOAD_OR:
13623      Opc = X86ISD::ATOMOR64_DAG;
13624      break;
13625    case ISD::ATOMIC_LOAD_SUB:
13626      Opc = X86ISD::ATOMSUB64_DAG;
13627      break;
13628    case ISD::ATOMIC_LOAD_XOR:
13629      Opc = X86ISD::ATOMXOR64_DAG;
13630      break;
13631    case ISD::ATOMIC_LOAD_MAX:
13632      Opc = X86ISD::ATOMMAX64_DAG;
13633      break;
13634    case ISD::ATOMIC_LOAD_MIN:
13635      Opc = X86ISD::ATOMMIN64_DAG;
13636      break;
13637    case ISD::ATOMIC_LOAD_UMAX:
13638      Opc = X86ISD::ATOMUMAX64_DAG;
13639      break;
13640    case ISD::ATOMIC_LOAD_UMIN:
13641      Opc = X86ISD::ATOMUMIN64_DAG;
13642      break;
13643    case ISD::ATOMIC_SWAP:
13644      Opc = X86ISD::ATOMSWAP64_DAG;
13645      break;
13646    }
13647    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
13648    return;
13649  }
13650  case ISD::ATOMIC_LOAD:
13651    ReplaceATOMIC_LOAD(N, Results, DAG);
13652  }
13653}
13654
13655const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
13656  switch (Opcode) {
13657  default: return NULL;
13658  case X86ISD::BSF:                return "X86ISD::BSF";
13659  case X86ISD::BSR:                return "X86ISD::BSR";
13660  case X86ISD::SHLD:               return "X86ISD::SHLD";
13661  case X86ISD::SHRD:               return "X86ISD::SHRD";
13662  case X86ISD::FAND:               return "X86ISD::FAND";
13663  case X86ISD::FANDN:              return "X86ISD::FANDN";
13664  case X86ISD::FOR:                return "X86ISD::FOR";
13665  case X86ISD::FXOR:               return "X86ISD::FXOR";
13666  case X86ISD::FSRL:               return "X86ISD::FSRL";
13667  case X86ISD::FILD:               return "X86ISD::FILD";
13668  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
13669  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
13670  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
13671  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
13672  case X86ISD::FLD:                return "X86ISD::FLD";
13673  case X86ISD::FST:                return "X86ISD::FST";
13674  case X86ISD::CALL:               return "X86ISD::CALL";
13675  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
13676  case X86ISD::BT:                 return "X86ISD::BT";
13677  case X86ISD::CMP:                return "X86ISD::CMP";
13678  case X86ISD::COMI:               return "X86ISD::COMI";
13679  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
13680  case X86ISD::CMPM:               return "X86ISD::CMPM";
13681  case X86ISD::CMPMU:              return "X86ISD::CMPMU";
13682  case X86ISD::SETCC:              return "X86ISD::SETCC";
13683  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
13684  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
13685  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
13686  case X86ISD::CMOV:               return "X86ISD::CMOV";
13687  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
13688  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
13689  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
13690  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
13691  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
13692  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
13693  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
13694  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
13695  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
13696  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
13697  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
13698  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
13699  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
13700  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
13701  case X86ISD::PSIGN:              return "X86ISD::PSIGN";
13702  case X86ISD::BLENDV:             return "X86ISD::BLENDV";
13703  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
13704  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
13705  case X86ISD::HADD:               return "X86ISD::HADD";
13706  case X86ISD::HSUB:               return "X86ISD::HSUB";
13707  case X86ISD::FHADD:              return "X86ISD::FHADD";
13708  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
13709  case X86ISD::UMAX:               return "X86ISD::UMAX";
13710  case X86ISD::UMIN:               return "X86ISD::UMIN";
13711  case X86ISD::SMAX:               return "X86ISD::SMAX";
13712  case X86ISD::SMIN:               return "X86ISD::SMIN";
13713  case X86ISD::FMAX:               return "X86ISD::FMAX";
13714  case X86ISD::FMIN:               return "X86ISD::FMIN";
13715  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
13716  case X86ISD::FMINC:              return "X86ISD::FMINC";
13717  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
13718  case X86ISD::FRCP:               return "X86ISD::FRCP";
13719  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
13720  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
13721  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
13722  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
13723  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
13724  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
13725  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
13726  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
13727  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
13728  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
13729  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
13730  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
13731  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
13732  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
13733  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
13734  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
13735  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
13736  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
13737  case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
13738  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
13739  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
13740  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
13741  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
13742  case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
13743  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
13744  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
13745  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
13746  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
13747  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
13748  case X86ISD::VSHL:               return "X86ISD::VSHL";
13749  case X86ISD::VSRL:               return "X86ISD::VSRL";
13750  case X86ISD::VSRA:               return "X86ISD::VSRA";
13751  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
13752  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
13753  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
13754  case X86ISD::CMPP:               return "X86ISD::CMPP";
13755  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
13756  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
13757  case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
13758  case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
13759  case X86ISD::ADD:                return "X86ISD::ADD";
13760  case X86ISD::SUB:                return "X86ISD::SUB";
13761  case X86ISD::ADC:                return "X86ISD::ADC";
13762  case X86ISD::SBB:                return "X86ISD::SBB";
13763  case X86ISD::SMUL:               return "X86ISD::SMUL";
13764  case X86ISD::UMUL:               return "X86ISD::UMUL";
13765  case X86ISD::INC:                return "X86ISD::INC";
13766  case X86ISD::DEC:                return "X86ISD::DEC";
13767  case X86ISD::OR:                 return "X86ISD::OR";
13768  case X86ISD::XOR:                return "X86ISD::XOR";
13769  case X86ISD::AND:                return "X86ISD::AND";
13770  case X86ISD::BLSI:               return "X86ISD::BLSI";
13771  case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
13772  case X86ISD::BLSR:               return "X86ISD::BLSR";
13773  case X86ISD::BZHI:               return "X86ISD::BZHI";
13774  case X86ISD::BEXTR:              return "X86ISD::BEXTR";
13775  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
13776  case X86ISD::PTEST:              return "X86ISD::PTEST";
13777  case X86ISD::TESTP:              return "X86ISD::TESTP";
13778  case X86ISD::TESTM:              return "X86ISD::TESTM";
13779  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
13780  case X86ISD::KTEST:              return "X86ISD::KTEST";
13781  case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
13782  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
13783  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
13784  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
13785  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
13786  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
13787  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
13788  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
13789  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
13790  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
13791  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
13792  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
13793  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
13794  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
13795  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
13796  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
13797  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
13798  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
13799  case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
13800  case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
13801  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
13802  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
13803  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
13804  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
13805  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
13806  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
13807  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
13808  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
13809  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
13810  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
13811  case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
13812  case X86ISD::SAHF:               return "X86ISD::SAHF";
13813  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
13814  case X86ISD::RDSEED:             return "X86ISD::RDSEED";
13815  case X86ISD::FMADD:              return "X86ISD::FMADD";
13816  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
13817  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
13818  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
13819  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
13820  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
13821  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
13822  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
13823  case X86ISD::XTEST:              return "X86ISD::XTEST";
13824  }
13825}
13826
13827// isLegalAddressingMode - Return true if the addressing mode represented
13828// by AM is legal for this target, for a load/store of the specified type.
13829bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
13830                                              Type *Ty) const {
13831  // X86 supports extremely general addressing modes.
13832  CodeModel::Model M = getTargetMachine().getCodeModel();
13833  Reloc::Model R = getTargetMachine().getRelocationModel();
13834
13835  // X86 allows a sign-extended 32-bit immediate field as a displacement.
13836  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
13837    return false;
13838
13839  if (AM.BaseGV) {
13840    unsigned GVFlags =
13841      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
13842
13843    // If a reference to this global requires an extra load, we can't fold it.
13844    if (isGlobalStubReference(GVFlags))
13845      return false;
13846
13847    // If BaseGV requires a register for the PIC base, we cannot also have a
13848    // BaseReg specified.
13849    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
13850      return false;
13851
13852    // If lower 4G is not available, then we must use rip-relative addressing.
13853    if ((M != CodeModel::Small || R != Reloc::Static) &&
13854        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
13855      return false;
13856  }
13857
13858  switch (AM.Scale) {
13859  case 0:
13860  case 1:
13861  case 2:
13862  case 4:
13863  case 8:
13864    // These scales always work.
13865    break;
13866  case 3:
13867  case 5:
13868  case 9:
13869    // These scales are formed with basereg+scalereg.  Only accept if there is
13870    // no basereg yet.
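         // For example, x*9 is only encodable as base + 8*index with both
         // registers set to x, which uses up the base-register slot.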
13871    if (AM.HasBaseReg)
13872      return false;
13873    break;
13874  default:  // Other stuff never works.
13875    return false;
13876  }
13877
13878  return true;
13879}
13880
13881bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
13882  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
13883    return false;
13884  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
13885  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
13886  return NumBits1 > NumBits2;
13887}
13888
13889bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
13890  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
13891    return false;
13892
13893  if (!isTypeLegal(EVT::getEVT(Ty1)))
13894    return false;
13895
13896  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
13897
13898  // Assuming the caller doesn't have a zeroext or signext return parameter,
13899  // truncation all the way down to i1 is valid.
13900  return true;
13901}
13902
13903bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
13904  return isInt<32>(Imm);
13905}
13906
13907bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
13908  // Can also use sub to handle negated immediates.
13909  return isInt<32>(Imm);
13910}
13911
13912bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
13913  if (!VT1.isInteger() || !VT2.isInteger())
13914    return false;
13915  unsigned NumBits1 = VT1.getSizeInBits();
13916  unsigned NumBits2 = VT2.getSizeInBits();
13917  return NumBits1 > NumBits2;
13918}
13919
13920bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
13921  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
13922  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
13923}
13924
13925bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
13926  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
13927  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
13928}
13929
13930bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
13931  EVT VT1 = Val.getValueType();
13932  if (isZExtFree(VT1, VT2))
13933    return true;
13934
13935  if (Val.getOpcode() != ISD::LOAD)
13936    return false;
13937
13938  if (!VT1.isSimple() || !VT1.isInteger() ||
13939      !VT2.isSimple() || !VT2.isInteger())
13940    return false;
13941
13942  switch (VT1.getSimpleVT().SimpleTy) {
13943  default: break;
13944  case MVT::i8:
13945  case MVT::i16:
13946  case MVT::i32:
13947    // X86 has 8, 16, and 32-bit zero-extending loads.
13948    return true;
13949  }
13950
13951  return false;
13952}
13953
13954bool
13955X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
13956  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
13957    return false;
13958
13959  VT = VT.getScalarType();
13960
13961  if (!VT.isSimple())
13962    return false;
13963
13964  switch (VT.getSimpleVT().SimpleTy) {
13965  case MVT::f32:
13966  case MVT::f64:
13967    return true;
13968  default:
13969    break;
13970  }
13971
13972  return false;
13973}
13974
13975bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
13976  // i16 instructions are longer (0x66 prefix) and potentially slower.
13977  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
13978}
13979
13980/// isShuffleMaskLegal - Targets can use this to indicate that they only
13981/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
13982/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
13983/// are assumed to be legal.
13984bool
13985X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
13986                                      EVT VT) const {
13987  if (!VT.isSimple())
13988    return false;
13989
13990  MVT SVT = VT.getSimpleVT();
13991
13992  // Very little shuffling can be done for 64-bit vectors right now.
13993  if (VT.getSizeInBits() == 64)
13994    return false;
13995
13996  // FIXME: pshufb, blends, shifts.
13997  return (SVT.getVectorNumElements() == 2 ||
13998          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
13999          isMOVLMask(M, SVT) ||
14000          isSHUFPMask(M, SVT) ||
14001          isPSHUFDMask(M, SVT) ||
14002          isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
14003          isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
14004          isPALIGNRMask(M, SVT, Subtarget) ||
14005          isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
14006          isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
14007          isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
14008          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
14009}
14010
14011bool
14012X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
14013                                          EVT VT) const {
14014  if (!VT.isSimple())
14015    return false;
14016
14017  MVT SVT = VT.getSimpleVT();
14018  unsigned NumElts = SVT.getVectorNumElements();
14019  // FIXME: This collection of masks seems suspect.
14020  if (NumElts == 2)
14021    return true;
14022  if (NumElts == 4 && SVT.is128BitVector()) {
14023    return (isMOVLMask(Mask, SVT)  ||
14024            isCommutedMOVLMask(Mask, SVT, true) ||
14025            isSHUFPMask(Mask, SVT) ||
14026            isSHUFPMask(Mask, SVT, /* Commuted */ true));
14027  }
14028  return false;
14029}
14030
14031//===----------------------------------------------------------------------===//
14032//                           X86 Scheduler Hooks
14033//===----------------------------------------------------------------------===//
14034
14035/// Utility function to emit xbegin specifying the start of an RTM region.
14036static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
14037                                     const TargetInstrInfo *TII) {
14038  DebugLoc DL = MI->getDebugLoc();
14039
14040  const BasicBlock *BB = MBB->getBasicBlock();
14041  MachineFunction::iterator I = MBB;
14042  ++I;
14043
14044  // For the v = xbegin(), we generate
14045  //
14046  // thisMBB:
14047  //  xbegin sinkMBB
14048  //
14049  // mainMBB:
14050  //  eax = -1
14051  //
14052  // sinkMBB:
14053  //  v = eax
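       // On a successful start, execution falls through to mainMBB, which sets
       // EAX to -1 (the XBEGIN_STARTED value).  On an abort, the hardware
       // resumes at the fallback label (sinkMBB) with the abort status already
       // in EAX, so sinkMBB simply copies EAX into the result register.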
14054
14055  MachineBasicBlock *thisMBB = MBB;
14056  MachineFunction *MF = MBB->getParent();
14057  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
14058  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
14059  MF->insert(I, mainMBB);
14060  MF->insert(I, sinkMBB);
14061
14062  // Transfer the remainder of BB and its successor edges to sinkMBB.
14063  sinkMBB->splice(sinkMBB->begin(), MBB,
14064                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
14065  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
14066
14067  // thisMBB:
14068  //  xbegin sinkMBB
14069  //  # fallthrough to mainMBB
14070  //  # abort branches to sinkMBB
14071  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
14072  thisMBB->addSuccessor(mainMBB);
14073  thisMBB->addSuccessor(sinkMBB);
14074
14075  // mainMBB:
14076  //  EAX = -1
14077  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
14078  mainMBB->addSuccessor(sinkMBB);
14079
14080  // sinkMBB:
14081  // EAX is live into the sinkMBB
14082  sinkMBB->addLiveIn(X86::EAX);
14083  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
14084          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
14085    .addReg(X86::EAX);
14086
14087  MI->eraseFromParent();
14088  return sinkMBB;
14089}
14090
14091// Get CMPXCHG opcode for the specified data type.
14092static unsigned getCmpXChgOpcode(EVT VT) {
14093  switch (VT.getSimpleVT().SimpleTy) {
14094  case MVT::i8:  return X86::LCMPXCHG8;
14095  case MVT::i16: return X86::LCMPXCHG16;
14096  case MVT::i32: return X86::LCMPXCHG32;
14097  case MVT::i64: return X86::LCMPXCHG64;
14098  default:
14099    break;
14100  }
14101  llvm_unreachable("Invalid operand size!");
14102}
14103
14104// Get LOAD opcode for the specified data type.
14105static unsigned getLoadOpcode(EVT VT) {
14106  switch (VT.getSimpleVT().SimpleTy) {
14107  case MVT::i8:  return X86::MOV8rm;
14108  case MVT::i16: return X86::MOV16rm;
14109  case MVT::i32: return X86::MOV32rm;
14110  case MVT::i64: return X86::MOV64rm;
14111  default:
14112    break;
14113  }
14114  llvm_unreachable("Invalid operand size!");
14115}
14116
14117// Get opcode of the non-atomic one from the specified atomic instruction.
14118static unsigned getNonAtomicOpcode(unsigned Opc) {
14119  switch (Opc) {
14120  case X86::ATOMAND8:  return X86::AND8rr;
14121  case X86::ATOMAND16: return X86::AND16rr;
14122  case X86::ATOMAND32: return X86::AND32rr;
14123  case X86::ATOMAND64: return X86::AND64rr;
14124  case X86::ATOMOR8:   return X86::OR8rr;
14125  case X86::ATOMOR16:  return X86::OR16rr;
14126  case X86::ATOMOR32:  return X86::OR32rr;
14127  case X86::ATOMOR64:  return X86::OR64rr;
14128  case X86::ATOMXOR8:  return X86::XOR8rr;
14129  case X86::ATOMXOR16: return X86::XOR16rr;
14130  case X86::ATOMXOR32: return X86::XOR32rr;
14131  case X86::ATOMXOR64: return X86::XOR64rr;
14132  }
14133  llvm_unreachable("Unhandled atomic-load-op opcode!");
14134}
14135
14136// Get opcode of the non-atomic one from the specified atomic instruction with
14137// extra opcode.
14138static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
14139                                               unsigned &ExtraOpc) {
14140  switch (Opc) {
14141  case X86::ATOMNAND8:  ExtraOpc = X86::NOT8r;   return X86::AND8rr;
14142  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r;  return X86::AND16rr;
14143  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r;  return X86::AND32rr;
14144  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r;  return X86::AND64rr;
14145  case X86::ATOMMAX8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVL32rr;
14146  case X86::ATOMMAX16:  ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
14147  case X86::ATOMMAX32:  ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
14148  case X86::ATOMMAX64:  ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
14149  case X86::ATOMMIN8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVG32rr;
14150  case X86::ATOMMIN16:  ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
14151  case X86::ATOMMIN32:  ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
14152  case X86::ATOMMIN64:  ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
14153  case X86::ATOMUMAX8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVB32rr;
14154  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
14155  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
14156  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
14157  case X86::ATOMUMIN8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVA32rr;
14158  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
14159  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
14160  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
14161  }
14162  llvm_unreachable("Unhandled atomic-load-op opcode!");
14163}
14164
14165// Get opcode of the non-atomic one from the specified atomic instruction for
14166// 64-bit data type on 32-bit target.
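     // The 64-bit value lives in a lo/hi pair of 32-bit registers: ADD and SUB
     // use ADC and SBB for the high half so the carry propagates, the logical
     // ops and SWAP apply the same 32-bit opcode to both halves, and the
     // min/max forms hand back a SETcc for the comparison.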
14167static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
14168  switch (Opc) {
14169  case X86::ATOMAND6432:  HiOpc = X86::AND32rr; return X86::AND32rr;
14170  case X86::ATOMOR6432:   HiOpc = X86::OR32rr;  return X86::OR32rr;
14171  case X86::ATOMXOR6432:  HiOpc = X86::XOR32rr; return X86::XOR32rr;
14172  case X86::ATOMADD6432:  HiOpc = X86::ADC32rr; return X86::ADD32rr;
14173  case X86::ATOMSUB6432:  HiOpc = X86::SBB32rr; return X86::SUB32rr;
14174  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
14175  case X86::ATOMMAX6432:  HiOpc = X86::SETLr;   return X86::SETLr;
14176  case X86::ATOMMIN6432:  HiOpc = X86::SETGr;   return X86::SETGr;
14177  case X86::ATOMUMAX6432: HiOpc = X86::SETBr;   return X86::SETBr;
14178  case X86::ATOMUMIN6432: HiOpc = X86::SETAr;   return X86::SETAr;
14179  }
14180  llvm_unreachable("Unhandled atomic-load-op opcode!");
14181}
14182
14183// Get opcode of the non-atomic one from the specified atomic instruction for
14184// 64-bit data type on 32-bit target with extra opcode.
14185static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
14186                                                   unsigned &HiOpc,
14187                                                   unsigned &ExtraOpc) {
14188  switch (Opc) {
14189  case X86::ATOMNAND6432:
14190    ExtraOpc = X86::NOT32r;
14191    HiOpc = X86::AND32rr;
14192    return X86::AND32rr;
14193  }
14194  llvm_unreachable("Unhandled atomic-load-op opcode!");
14195}
14196
14197// Get pseudo CMOV opcode from the specified data type.
14198static unsigned getPseudoCMOVOpc(EVT VT) {
14199  switch (VT.getSimpleVT().SimpleTy) {
14200  case MVT::i8:  return X86::CMOV_GR8;
14201  case MVT::i16: return X86::CMOV_GR16;
14202  case MVT::i32: return X86::CMOV_GR32;
14203  default:
14204    break;
14205  }
14206  llvm_unreachable("Unknown CMOV opcode!");
14207}
14208
14209// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
14210// They will be translated into a spin-loop or compare-exchange loop from
14211//
14212//    ...
14213//    dst = atomic-fetch-op MI.addr, MI.val
14214//    ...
14215//
14216// to
14217//
14218//    ...
14219//    t1 = LOAD MI.addr
14220// loop:
14221//    t4 = phi(t1, t3 / loop)
14222//    t2 = OP MI.val, t4
14223//    EAX = t4
14224//    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
14225//    t3 = EAX
14226//    JNE loop
14227// sink:
14228//    dst = t3
14229//    ...
14230MachineBasicBlock *
14231X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
14232                                       MachineBasicBlock *MBB) const {
14233  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14234  DebugLoc DL = MI->getDebugLoc();
14235
14236  MachineFunction *MF = MBB->getParent();
14237  MachineRegisterInfo &MRI = MF->getRegInfo();
14238
14239  const BasicBlock *BB = MBB->getBasicBlock();
14240  MachineFunction::iterator I = MBB;
14241  ++I;
14242
14243  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
14244         "Unexpected number of operands");
14245
14246  assert(MI->hasOneMemOperand() &&
14247         "Expected atomic-load-op to have one memoperand");
14248
14249  // Memory Reference
14250  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
14251  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
14252
14253  unsigned DstReg, SrcReg;
14254  unsigned MemOpndSlot;
14255
14256  unsigned CurOp = 0;
14257
14258  DstReg = MI->getOperand(CurOp++).getReg();
14259  MemOpndSlot = CurOp;
14260  CurOp += X86::AddrNumOperands;
14261  SrcReg = MI->getOperand(CurOp++).getReg();
14262
14263  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
14264  MVT::SimpleValueType VT = *RC->vt_begin();
14265  unsigned t1 = MRI.createVirtualRegister(RC);
14266  unsigned t2 = MRI.createVirtualRegister(RC);
14267  unsigned t3 = MRI.createVirtualRegister(RC);
14268  unsigned t4 = MRI.createVirtualRegister(RC);
14269  unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
14270
14271  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
14272  unsigned LOADOpc = getLoadOpcode(VT);
14273
14274  // For the atomic load-arith operator, we generate
14275  //
14276  //  thisMBB:
14277  //    t1 = LOAD [MI.addr]
14278  //  mainMBB:
14279  //    t4 = phi(t1 / thisMBB, t3 / mainMBB)
14280  //    t1 = OP MI.val, EAX
14281  //    EAX = t4
14282  //    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
14283  //    t3 = EAX
14284  //    JNE mainMBB
14285  //  sinkMBB:
14286  //    dst = t3
14287
14288  MachineBasicBlock *thisMBB = MBB;
14289  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
14290  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
14291  MF->insert(I, mainMBB);
14292  MF->insert(I, sinkMBB);
14293
14294  MachineInstrBuilder MIB;
14295
14296  // Transfer the remainder of BB and its successor edges to sinkMBB.
14297  sinkMBB->splice(sinkMBB->begin(), MBB,
14298                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
14299  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
14300
14301  // thisMBB:
14302  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
14303  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14304    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
14305    if (NewMO.isReg())
14306      NewMO.setIsKill(false);
14307    MIB.addOperand(NewMO);
14308  }
14309  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
14310    unsigned flags = (*MMOI)->getFlags();
14311    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
14312    MachineMemOperand *MMO =
14313      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
14314                               (*MMOI)->getSize(),
14315                               (*MMOI)->getBaseAlignment(),
14316                               (*MMOI)->getTBAAInfo(),
14317                               (*MMOI)->getRanges());
14318    MIB.addMemOperand(MMO);
14319  }
14320
14321  thisMBB->addSuccessor(mainMBB);
14322
14323  // mainMBB:
14324  MachineBasicBlock *origMainMBB = mainMBB;
14325
14326  // Add a PHI.
14327  MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
14328                        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
14329
14330  unsigned Opc = MI->getOpcode();
14331  switch (Opc) {
14332  default:
14333    llvm_unreachable("Unhandled atomic-load-op opcode!");
14334  case X86::ATOMAND8:
14335  case X86::ATOMAND16:
14336  case X86::ATOMAND32:
14337  case X86::ATOMAND64:
14338  case X86::ATOMOR8:
14339  case X86::ATOMOR16:
14340  case X86::ATOMOR32:
14341  case X86::ATOMOR64:
14342  case X86::ATOMXOR8:
14343  case X86::ATOMXOR16:
14344  case X86::ATOMXOR32:
14345  case X86::ATOMXOR64: {
14346    unsigned ARITHOpc = getNonAtomicOpcode(Opc);
14347    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
14348      .addReg(t4);
14349    break;
14350  }
14351  case X86::ATOMNAND8:
14352  case X86::ATOMNAND16:
14353  case X86::ATOMNAND32:
14354  case X86::ATOMNAND64: {
14355    unsigned Tmp = MRI.createVirtualRegister(RC);
14356    unsigned NOTOpc;
14357    unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
14358    BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
14359      .addReg(t4);
14360    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
14361    break;
14362  }
14363  case X86::ATOMMAX8:
14364  case X86::ATOMMAX16:
14365  case X86::ATOMMAX32:
14366  case X86::ATOMMAX64:
14367  case X86::ATOMMIN8:
14368  case X86::ATOMMIN16:
14369  case X86::ATOMMIN32:
14370  case X86::ATOMMIN64:
14371  case X86::ATOMUMAX8:
14372  case X86::ATOMUMAX16:
14373  case X86::ATOMUMAX32:
14374  case X86::ATOMUMAX64:
14375  case X86::ATOMUMIN8:
14376  case X86::ATOMUMIN16:
14377  case X86::ATOMUMIN32:
14378  case X86::ATOMUMIN64: {
14379    unsigned CMPOpc;
14380    unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
14381
14382    BuildMI(mainMBB, DL, TII->get(CMPOpc))
14383      .addReg(SrcReg)
14384      .addReg(t4);
14385
14386    if (Subtarget->hasCMov()) {
14387      if (VT != MVT::i8) {
14388        // Native support
14389        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
14390          .addReg(SrcReg)
14391          .addReg(t4);
14392      } else {
14393        // Promote i8 to i32 to use CMOV32
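        // There is no 8-bit CMOV, so widen both operands into GR32 registers with
        // INSERT_SUBREG, select with the 32-bit CMOV, and copy the low byte back.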
14394        const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
14395        const TargetRegisterClass *RC32 =
14396          TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
14397        unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
14398        unsigned AccReg32 = MRI.createVirtualRegister(RC32);
14399        unsigned Tmp = MRI.createVirtualRegister(RC32);
14400
14401        unsigned Undef = MRI.createVirtualRegister(RC32);
14402        BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
14403
14404        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
14405          .addReg(Undef)
14406          .addReg(SrcReg)
14407          .addImm(X86::sub_8bit);
14408        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
14409          .addReg(Undef)
14410          .addReg(t4)
14411          .addImm(X86::sub_8bit);
14412
14413        BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
14414          .addReg(SrcReg32)
14415          .addReg(AccReg32);
14416
14417        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
14418          .addReg(Tmp, 0, X86::sub_8bit);
14419      }
14420    } else {
14421      // Use pseudo select and lower them.
14422      assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
14423             "Invalid atomic-load-op transformation!");
14424      unsigned SelOpc = getPseudoCMOVOpc(VT);
14425      X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
14426      assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
14427      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
14428              .addReg(SrcReg).addReg(t4)
14429              .addImm(CC);
14430      mainMBB = EmitLoweredSelect(MIB, mainMBB);
14431      // Replace the original PHI node as mainMBB is changed after CMOV
14432      // lowering.
14433      BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
14434        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
14435      Phi->eraseFromParent();
14436    }
14437    break;
14438  }
14439  }
14440
14441  // Copy PhyReg back from virtual register.
14442  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
14443    .addReg(t4);
14444
14445  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
14446  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14447    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
14448    if (NewMO.isReg())
14449      NewMO.setIsKill(false);
14450    MIB.addOperand(NewMO);
14451  }
14452  MIB.addReg(t2);
14453  MIB.setMemRefs(MMOBegin, MMOEnd);
14454
14455  // Copy PhyReg back to virtual register.
14456  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
14457    .addReg(PhyReg);
14458
14459  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
14460
14461  mainMBB->addSuccessor(origMainMBB);
14462  mainMBB->addSuccessor(sinkMBB);
14463
14464  // sinkMBB:
14465  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
14466          TII->get(TargetOpcode::COPY), DstReg)
14467    .addReg(t3);
14468
14469  MI->eraseFromParent();
14470  return sinkMBB;
14471}
14472
14473  // EmitAtomicLoadArith6432 - emit the code sequence for 64-bit pseudo atomic
14474  // instructions on 32-bit targets. They are translated into a compare-exchange
14475  // (CMPXCHG8B) loop from
14476//
14477//    ...
14478//    dst = atomic-fetch-op MI.addr, MI.val
14479//    ...
14480//
14481// to
14482//
14483//    ...
14484//    t1L = LOAD [MI.addr + 0]
14485//    t1H = LOAD [MI.addr + 4]
14486// loop:
14487//    t4L = phi(t1L, t3L / loop)
14488//    t4H = phi(t1H, t3H / loop)
14489//    t2L = OP MI.val.lo, t4L
14490//    t2H = OP MI.val.hi, t4H
14491//    EAX = t4L
14492//    EDX = t4H
14493//    EBX = t2L
14494//    ECX = t2H
14495//    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
14496//    t3L = EAX
14497//    t3H = EDX
14498//    JNE loop
14499// sink:
14500//    dstL = t3L
14501//    dstH = t3H
14502//    ...
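//
// For example, an ATOMADD6432 pseudo is expanded into an ADD32rr/ADC32rr pair on
// the two halves feeding LCMPXCHG8B, repeated until the exchange succeeds.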
14503MachineBasicBlock *
14504X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
14505                                           MachineBasicBlock *MBB) const {
14506  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14507  DebugLoc DL = MI->getDebugLoc();
14508
14509  MachineFunction *MF = MBB->getParent();
14510  MachineRegisterInfo &MRI = MF->getRegInfo();
14511
14512  const BasicBlock *BB = MBB->getBasicBlock();
14513  MachineFunction::iterator I = MBB;
14514  ++I;
14515
14516  assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 &&
14517         "Unexpected number of operands");
14518
14519  assert(MI->hasOneMemOperand() &&
14520         "Expected atomic-load-op6432 to have one memoperand");
14521
14522  // Memory Reference
14523  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
14524  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
14525
14526  unsigned DstLoReg, DstHiReg;
14527  unsigned SrcLoReg, SrcHiReg;
14528  unsigned MemOpndSlot;
14529
14530  unsigned CurOp = 0;
14531
14532  DstLoReg = MI->getOperand(CurOp++).getReg();
14533  DstHiReg = MI->getOperand(CurOp++).getReg();
14534  MemOpndSlot = CurOp;
14535  CurOp += X86::AddrNumOperands;
14536  SrcLoReg = MI->getOperand(CurOp++).getReg();
14537  SrcHiReg = MI->getOperand(CurOp++).getReg();
14538
14539  const TargetRegisterClass *RC = &X86::GR32RegClass;
14540  const TargetRegisterClass *RC8 = &X86::GR8RegClass;
14541
14542  unsigned t1L = MRI.createVirtualRegister(RC);
14543  unsigned t1H = MRI.createVirtualRegister(RC);
14544  unsigned t2L = MRI.createVirtualRegister(RC);
14545  unsigned t2H = MRI.createVirtualRegister(RC);
14546  unsigned t3L = MRI.createVirtualRegister(RC);
14547  unsigned t3H = MRI.createVirtualRegister(RC);
14548  unsigned t4L = MRI.createVirtualRegister(RC);
14549  unsigned t4H = MRI.createVirtualRegister(RC);
14550
14551  unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
14552  unsigned LOADOpc = X86::MOV32rm;
14553
14554  // For the atomic load-arith operator, we generate
14555  //
14556  //  thisMBB:
14557  //    t1L = LOAD [MI.addr + 0]
14558  //    t1H = LOAD [MI.addr + 4]
14559  //  mainMBB:
14560  //    t4L = phi(t1L / thisMBB, t3L / mainMBB)
14561  //    t4H = phi(t1H / thisMBB, t3H / mainMBB)
14562  //    t2L = OP MI.val.lo, t4L
14563  //    t2H = OP MI.val.hi, t4H
14564  //    EAX = t4L, EDX = t4H
14565  //    EBX = t2L, ECX = t2H
14566  //    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
14567  //    t3L = EAX
14568  //    t3H = EDX
14569  //    JNE mainMBB
14570  //  sinkMBB:
14571  //    dstL = t3L
14572  //    dstH = t3H
14573
14574  MachineBasicBlock *thisMBB = MBB;
14575  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
14576  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
14577  MF->insert(I, mainMBB);
14578  MF->insert(I, sinkMBB);
14579
14580  MachineInstrBuilder MIB;
14581
14582  // Transfer the remainder of BB and its successor edges to sinkMBB.
14583  sinkMBB->splice(sinkMBB->begin(), MBB,
14584                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
14585  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
14586
14587  // thisMBB:
14588  // Lo
14589  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
14590  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14591    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
14592    if (NewMO.isReg())
14593      NewMO.setIsKill(false);
14594    MIB.addOperand(NewMO);
14595  }
14596  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
14597    unsigned flags = (*MMOI)->getFlags();
14598    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
14599    MachineMemOperand *MMO =
14600      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
14601                               (*MMOI)->getSize(),
14602                               (*MMOI)->getBaseAlignment(),
14603                               (*MMOI)->getTBAAInfo(),
14604                               (*MMOI)->getRanges());
14605    MIB.addMemOperand(MMO);
14606  }
14607  MachineInstr *LowMI = MIB;
14608
14609  // Hi
14610  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
14611  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14612    if (i == X86::AddrDisp) {
14613      MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
14614    } else {
14615      MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
14616      if (NewMO.isReg())
14617        NewMO.setIsKill(false);
14618      MIB.addOperand(NewMO);
14619    }
14620  }
14621  MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
14622
14623  thisMBB->addSuccessor(mainMBB);
14624
14625  // mainMBB:
14626  MachineBasicBlock *origMainMBB = mainMBB;
14627
14628  // Add PHIs.
14629  MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
14630                        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
14631  MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
14632                        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
14633
14634  unsigned Opc = MI->getOpcode();
14635  switch (Opc) {
14636  default:
14637    llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
14638  case X86::ATOMAND6432:
14639  case X86::ATOMOR6432:
14640  case X86::ATOMXOR6432:
14641  case X86::ATOMADD6432:
14642  case X86::ATOMSUB6432: {
14643    unsigned HiOpc;
14644    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
14645    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
14646      .addReg(SrcLoReg);
14647    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
14648      .addReg(SrcHiReg);
14649    break;
14650  }
14651  case X86::ATOMNAND6432: {
14652    unsigned HiOpc, NOTOpc;
14653    unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
14654    unsigned TmpL = MRI.createVirtualRegister(RC);
14655    unsigned TmpH = MRI.createVirtualRegister(RC);
14656    BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
14657      .addReg(t4L);
14658    BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
14659      .addReg(t4H);
14660    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
14661    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
14662    break;
14663  }
14664  case X86::ATOMMAX6432:
14665  case X86::ATOMMIN6432:
14666  case X86::ATOMUMAX6432:
14667  case X86::ATOMUMIN6432: {
14668    unsigned HiOpc;
14669    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
14670    unsigned cL = MRI.createVirtualRegister(RC8);
14671    unsigned cH = MRI.createVirtualRegister(RC8);
14672    unsigned cL32 = MRI.createVirtualRegister(RC);
14673    unsigned cH32 = MRI.createVirtualRegister(RC);
14674    unsigned cc = MRI.createVirtualRegister(RC);
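    // The high halves decide the 64-bit comparison; only when they are equal does
    // the low-half comparison matter. Select between cL and cH accordingly, then
    // test the result to choose either the source operand or the loaded value.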
14675    // cl := cmp src_lo, lo
14676    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
14677      .addReg(SrcLoReg).addReg(t4L);
14678    BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
14679    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
14680    // ch := cmp src_hi, hi
14681    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
14682      .addReg(SrcHiReg).addReg(t4H);
14683    BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
14684    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
14685    // cc := if (src_hi == hi) ? cl : ch;
14686    if (Subtarget->hasCMov()) {
14687      BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
14688        .addReg(cH32).addReg(cL32);
14689    } else {
14690      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
14691              .addReg(cH32).addReg(cL32)
14692              .addImm(X86::COND_E);
14693      mainMBB = EmitLoweredSelect(MIB, mainMBB);
14694    }
14695    BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
14696    if (Subtarget->hasCMov()) {
14697      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
14698        .addReg(SrcLoReg).addReg(t4L);
14699      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
14700        .addReg(SrcHiReg).addReg(t4H);
14701    } else {
14702      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
14703              .addReg(SrcLoReg).addReg(t4L)
14704              .addImm(X86::COND_NE);
14705      mainMBB = EmitLoweredSelect(MIB, mainMBB);
14706      // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
14707      // 2nd CMOV lowering.
14708      mainMBB->addLiveIn(X86::EFLAGS);
14709      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
14710              .addReg(SrcHiReg).addReg(t4H)
14711              .addImm(X86::COND_NE);
14712      mainMBB = EmitLoweredSelect(MIB, mainMBB);
14713      // Replace the original PHI node as mainMBB is changed after CMOV
14714      // lowering.
14715      BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
14716        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
14717      BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
14718        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
14719      PhiL->eraseFromParent();
14720      PhiH->eraseFromParent();
14721    }
14722    break;
14723  }
14724  case X86::ATOMSWAP6432: {
14725    unsigned HiOpc;
14726    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
14727    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
14728    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
14729    break;
14730  }
14731  }
14732
14733  // Copy EDX:EAX from t4H:t4L
14734  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
14735  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
14736  // Copy ECX:EBX from t2H:t2L
14737  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
14738  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
14739
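  // CMPXCHG8B compares EDX:EAX with the 8-byte memory operand; if they match it
  // stores ECX:EBX, otherwise it loads the current memory value into EDX:EAX.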
14740  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
14741  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
14742    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
14743    if (NewMO.isReg())
14744      NewMO.setIsKill(false);
14745    MIB.addOperand(NewMO);
14746  }
14747  MIB.setMemRefs(MMOBegin, MMOEnd);
14748
14749  // Copy EDX:EAX back to t3H:t3L
14750  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
14751  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
14752
14753  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
14754
14755  mainMBB->addSuccessor(origMainMBB);
14756  mainMBB->addSuccessor(sinkMBB);
14757
14758  // sinkMBB:
14759  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
14760          TII->get(TargetOpcode::COPY), DstLoReg)
14761    .addReg(t3L);
14762  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
14763          TII->get(TargetOpcode::COPY), DstHiReg)
14764    .addReg(t3H);
14765
14766  MI->eraseFromParent();
14767  return sinkMBB;
14768}
14769
14770// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
14771// or XMM0_V32I8 in AVX all of this code can be replaced with that
14772// in the .td file.
14773static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
14774                                       const TargetInstrInfo *TII) {
14775  unsigned Opc;
14776  switch (MI->getOpcode()) {
14777  default: llvm_unreachable("illegal opcode!");
14778  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
14779  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
14780  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
14781  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
14782  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
14783  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
14784  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
14785  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
14786  }
14787
14788  DebugLoc dl = MI->getDebugLoc();
14789  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
14790
14791  unsigned NumArgs = MI->getNumOperands();
14792  for (unsigned i = 1; i < NumArgs; ++i) {
14793    MachineOperand &Op = MI->getOperand(i);
14794    if (!(Op.isReg() && Op.isImplicit()))
14795      MIB.addOperand(Op);
14796  }
14797  if (MI->hasOneMemOperand())
14798    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
14799
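  // PCMP{I,E}STRM leaves its mask result in XMM0, so copy it into the pseudo's
  // destination register.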
14800  BuildMI(*BB, MI, dl,
14801    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
14802    .addReg(X86::XMM0);
14803
14804  MI->eraseFromParent();
14805  return BB;
14806}
14807
14808// FIXME: Custom handling because TableGen doesn't support multiple implicit
14809// defs in an instruction pattern
14810static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
14811                                       const TargetInstrInfo *TII) {
14812  unsigned Opc;
14813  switch (MI->getOpcode()) {
14814  default: llvm_unreachable("illegal opcode!");
14815  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
14816  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
14817  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
14818  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
14819  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
14820  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
14821  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
14822  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
14823  }
14824
14825  DebugLoc dl = MI->getDebugLoc();
14826  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
14827
14828  unsigned NumArgs = MI->getNumOperands(); // operand 0 (the result) is skipped below
14829  for (unsigned i = 1; i < NumArgs; ++i) {
14830    MachineOperand &Op = MI->getOperand(i);
14831    if (!(Op.isReg() && Op.isImplicit()))
14832      MIB.addOperand(Op);
14833  }
14834  if (MI->hasOneMemOperand())
14835    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
14836
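  // PCMP{I,E}STRI returns its index result in ECX, so copy it into the pseudo's
  // destination register.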
14837  BuildMI(*BB, MI, dl,
14838    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
14839    .addReg(X86::ECX);
14840
14841  MI->eraseFromParent();
14842  return BB;
14843}
14844
14845static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
14846                                       const TargetInstrInfo *TII,
14847                                       const X86Subtarget* Subtarget) {
14848  DebugLoc dl = MI->getDebugLoc();
14849
14850  // Address into RAX/EAX, other two args into ECX, EDX.
14851  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
14852  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
14853  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
14854  for (int i = 0; i < X86::AddrNumOperands; ++i)
14855    MIB.addOperand(MI->getOperand(i));
14856
14857  unsigned ValOps = X86::AddrNumOperands;
14858  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
14859    .addReg(MI->getOperand(ValOps).getReg());
14860  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
14861    .addReg(MI->getOperand(ValOps+1).getReg());
14862
14863  // MONITOR itself takes no explicit operands; it implicitly reads EAX/RAX, ECX and EDX.
14864  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
14865
14866  MI->eraseFromParent(); // The pseudo is gone now.
14867  return BB;
14868}
14869
14870MachineBasicBlock *
14871X86TargetLowering::EmitVAARG64WithCustomInserter(
14872                   MachineInstr *MI,
14873                   MachineBasicBlock *MBB) const {
14874  // Emit va_arg instruction on X86-64.
14875
14876  // Operands to this pseudo-instruction:
14877  // 0  ) Output        : destination address (reg)
14878  // 1-5) Input         : va_list address (addr, i64mem)
14879  // 6  ) ArgSize       : Size (in bytes) of vararg type
14880  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
14881  // 8  ) Align         : Alignment of type
14882  // 9  ) EFLAGS (implicit-def)
14883
14884  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
14885  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
14886
14887  unsigned DestReg = MI->getOperand(0).getReg();
14888  MachineOperand &Base = MI->getOperand(1);
14889  MachineOperand &Scale = MI->getOperand(2);
14890  MachineOperand &Index = MI->getOperand(3);
14891  MachineOperand &Disp = MI->getOperand(4);
14892  MachineOperand &Segment = MI->getOperand(5);
14893  unsigned ArgSize = MI->getOperand(6).getImm();
14894  unsigned ArgMode = MI->getOperand(7).getImm();
14895  unsigned Align = MI->getOperand(8).getImm();
14896
14897  // Memory Reference
14898  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
14899  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
14900  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
14901
14902  // Machine Information
14903  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14904  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
14905  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
14906  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
14907  DebugLoc DL = MI->getDebugLoc();
14908
14909  // struct va_list {
14910  //   i32   gp_offset
14911  //   i32   fp_offset
14912  //   i64   overflow_area (address)
14913  //   i64   reg_save_area (address)
14914  // }
14915  // sizeof(va_list) = 24
14916  // alignment(va_list) = 8
14917
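  // The register save area holds the 6 GP argument registers (8 bytes each)
  // followed by the 8 XMM argument registers (16 bytes each); MaxOffset is the
  // end of the region that the relevant offset field may still index into.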
14918  unsigned TotalNumIntRegs = 6;
14919  unsigned TotalNumXMMRegs = 8;
14920  bool UseGPOffset = (ArgMode == 1);
14921  bool UseFPOffset = (ArgMode == 2);
14922  unsigned MaxOffset = TotalNumIntRegs * 8 +
14923                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
14924
14925  // Align ArgSize to a multiple of 8.
14926  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
14927  bool NeedsAlign = (Align > 8);
14928
14929  MachineBasicBlock *thisMBB = MBB;
14930  MachineBasicBlock *overflowMBB;
14931  MachineBasicBlock *offsetMBB;
14932  MachineBasicBlock *endMBB;
14933
14934  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
14935  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
14936  unsigned OffsetReg = 0;
14937
14938  if (!UseGPOffset && !UseFPOffset) {
14939    // If we only pull from the overflow region, we don't create a branch.
14940    // We don't need to alter control flow.
14941    OffsetDestReg = 0; // unused
14942    OverflowDestReg = DestReg;
14943
14944    offsetMBB = NULL;
14945    overflowMBB = thisMBB;
14946    endMBB = thisMBB;
14947  } else {
14948    // First emit code to check if gp_offset (or fp_offset) is below the bound.
14949    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
14950    // If not, pull from overflow_area. (branch to overflowMBB)
14951    //
14952    //       thisMBB
14953    //         |     .
14954    //         |        .
14955    //     offsetMBB   overflowMBB
14956    //         |        .
14957    //         |     .
14958    //        endMBB
14959
14960    // Registers for the PHI in endMBB
14961    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
14962    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
14963
14964    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
14965    MachineFunction *MF = MBB->getParent();
14966    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
14967    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
14968    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
14969
14970    MachineFunction::iterator MBBIter = MBB;
14971    ++MBBIter;
14972
14973    // Insert the new basic blocks
14974    MF->insert(MBBIter, offsetMBB);
14975    MF->insert(MBBIter, overflowMBB);
14976    MF->insert(MBBIter, endMBB);
14977
14978    // Transfer the remainder of MBB and its successor edges to endMBB.
14979    endMBB->splice(endMBB->begin(), thisMBB,
14980                    llvm::next(MachineBasicBlock::iterator(MI)),
14981                    thisMBB->end());
14982    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
14983
14984    // Make offsetMBB and overflowMBB successors of thisMBB
14985    thisMBB->addSuccessor(offsetMBB);
14986    thisMBB->addSuccessor(overflowMBB);
14987
14988    // endMBB is a successor of both offsetMBB and overflowMBB
14989    offsetMBB->addSuccessor(endMBB);
14990    overflowMBB->addSuccessor(endMBB);
14991
14992    // Load the offset value into a register
14993    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
14994    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
14995      .addOperand(Base)
14996      .addOperand(Scale)
14997      .addOperand(Index)
14998      .addDisp(Disp, UseFPOffset ? 4 : 0)
14999      .addOperand(Segment)
15000      .setMemRefs(MMOBegin, MMOEnd);
15001
15002    // Check if there is enough room left to pull this argument.
15003    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
15004      .addReg(OffsetReg)
15005      .addImm(MaxOffset + 8 - ArgSizeA8);
15006
15007    // Branch to "overflowMBB" if offset >= max
15008    // Fall through to "offsetMBB" otherwise
15009    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
15010      .addMBB(overflowMBB);
15011  }
15012
15013  // In offsetMBB, emit code to use the reg_save_area.
15014  if (offsetMBB) {
15015    assert(OffsetReg != 0);
15016
15017    // Read the reg_save_area address.
15018    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
15019    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
15020      .addOperand(Base)
15021      .addOperand(Scale)
15022      .addOperand(Index)
15023      .addDisp(Disp, 16)
15024      .addOperand(Segment)
15025      .setMemRefs(MMOBegin, MMOEnd);
15026
15027    // Zero-extend the offset
15028    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
15029    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
15030      .addImm(0)
15031      .addReg(OffsetReg)
15032      .addImm(X86::sub_32bit);
15033
15034    // Add the offset to the reg_save_area to get the final address.
15035    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
15036      .addReg(OffsetReg64)
15037      .addReg(RegSaveReg);
15038
15039    // Compute the offset for the next argument
15040    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
15041    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
15042      .addReg(OffsetReg)
15043      .addImm(UseFPOffset ? 16 : 8);
15044
15045    // Store it back into the va_list.
15046    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
15047      .addOperand(Base)
15048      .addOperand(Scale)
15049      .addOperand(Index)
15050      .addDisp(Disp, UseFPOffset ? 4 : 0)
15051      .addOperand(Segment)
15052      .addReg(NextOffsetReg)
15053      .setMemRefs(MMOBegin, MMOEnd);
15054
15055    // Jump to endMBB
15056    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
15057      .addMBB(endMBB);
15058  }
15059
15060  //
15061  // Emit code to use overflow area
15062  //
15063
15064  // Load the overflow_area address into a register.
15065  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
15066  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
15067    .addOperand(Base)
15068    .addOperand(Scale)
15069    .addOperand(Index)
15070    .addDisp(Disp, 8)
15071    .addOperand(Segment)
15072    .setMemRefs(MMOBegin, MMOEnd);
15073
15074  // If we need to align it, do so. Otherwise, just copy the address
15075  // to OverflowDestReg.
15076  if (NeedsAlign) {
15077    // Align the overflow address
15078    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
15079    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
15080
15081    // aligned_addr = (addr + (align-1)) & ~(align-1)
15082    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
15083      .addReg(OverflowAddrReg)
15084      .addImm(Align-1);
15085
15086    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
15087      .addReg(TmpReg)
15088      .addImm(~(uint64_t)(Align-1));
15089  } else {
15090    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
15091      .addReg(OverflowAddrReg);
15092  }
15093
15094  // Compute the next overflow address after this argument.
15095  // (the overflow address should be kept 8-byte aligned)
15096  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
15097  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
15098    .addReg(OverflowDestReg)
15099    .addImm(ArgSizeA8);
15100
15101  // Store the new overflow address.
15102  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
15103    .addOperand(Base)
15104    .addOperand(Scale)
15105    .addOperand(Index)
15106    .addDisp(Disp, 8)
15107    .addOperand(Segment)
15108    .addReg(NextAddrReg)
15109    .setMemRefs(MMOBegin, MMOEnd);
15110
15111  // If we branched, emit the PHI to the front of endMBB.
15112  if (offsetMBB) {
15113    BuildMI(*endMBB, endMBB->begin(), DL,
15114            TII->get(X86::PHI), DestReg)
15115      .addReg(OffsetDestReg).addMBB(offsetMBB)
15116      .addReg(OverflowDestReg).addMBB(overflowMBB);
15117  }
15118
15119  // Erase the pseudo instruction
15120  MI->eraseFromParent();
15121
15122  return endMBB;
15123}
15124
15125MachineBasicBlock *
15126X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
15127                                                 MachineInstr *MI,
15128                                                 MachineBasicBlock *MBB) const {
15129  // Emit code to save XMM registers to the stack. The ABI says that the
15130  // number of registers to save is given in %al, so it's theoretically
15131  // possible to do an indirect jump trick to avoid saving all of them;
15132  // however, this code takes a simpler approach and just executes all
15133  // of the stores if %al is non-zero. It's less code, and it's probably
15134  // easier on the hardware branch predictor, and stores aren't all that
15135  // expensive anyway.
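  // In effect this emits:  if (%al != 0) { store each XMM argument register into
  // its 16-byte slot of the register save area }  (the %al test is skipped on
  // Win64).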
15136
15137  // Create the new basic blocks. One block contains all the XMM stores,
15138  // and one block is the final destination regardless of whether any
15139  // stores were performed.
15140  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
15141  MachineFunction *F = MBB->getParent();
15142  MachineFunction::iterator MBBIter = MBB;
15143  ++MBBIter;
15144  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
15145  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
15146  F->insert(MBBIter, XMMSaveMBB);
15147  F->insert(MBBIter, EndMBB);
15148
15149  // Transfer the remainder of MBB and its successor edges to EndMBB.
15150  EndMBB->splice(EndMBB->begin(), MBB,
15151                 llvm::next(MachineBasicBlock::iterator(MI)),
15152                 MBB->end());
15153  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
15154
15155  // The original block will now fall through to the XMM save block.
15156  MBB->addSuccessor(XMMSaveMBB);
15157  // The XMMSaveMBB will fall through to the end block.
15158  XMMSaveMBB->addSuccessor(EndMBB);
15159
15160  // Now add the instructions.
15161  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15162  DebugLoc DL = MI->getDebugLoc();
15163
15164  unsigned CountReg = MI->getOperand(0).getReg();
15165  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
15166  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
15167
15168  if (!Subtarget->isTargetWin64()) {
15169    // If %al is 0, branch around the XMM save block.
15170    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
15171    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
15172    MBB->addSuccessor(EndMBB);
15173  }
15174
15175  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
15176  // In the XMM save block, save all the XMM argument registers.
15177  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
15178    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
15179    MachineMemOperand *MMO =
15180      F->getMachineMemOperand(
15181          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
15182        MachineMemOperand::MOStore,
15183        /*Size=*/16, /*Align=*/16);
15184    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
15185      .addFrameIndex(RegSaveFrameIndex)
15186      .addImm(/*Scale=*/1)
15187      .addReg(/*IndexReg=*/0)
15188      .addImm(/*Disp=*/Offset)
15189      .addReg(/*Segment=*/0)
15190      .addReg(MI->getOperand(i).getReg())
15191      .addMemOperand(MMO);
15192  }
15193
15194  MI->eraseFromParent();   // The pseudo instruction is gone now.
15195
15196  return EndMBB;
15197}
15198
15199// The EFLAGS operand of SelectItr might be missing a kill marker
15200// because there were multiple uses of EFLAGS, and ISel didn't know
15201// which to mark. Figure out whether SelectItr should have had a
15202// kill marker, and set it if it should. Returns the correct kill
15203// marker value.
15204static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
15205                                     MachineBasicBlock* BB,
15206                                     const TargetRegisterInfo* TRI) {
15207  // Scan forward through BB for a use/def of EFLAGS.
15208  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
15209  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
15210    const MachineInstr& mi = *miI;
15211    if (mi.readsRegister(X86::EFLAGS))
15212      return false;
15213    if (mi.definesRegister(X86::EFLAGS))
15214      break; // Should have kill-flag - update below.
15215  }
15216
15217  // If we hit the end of the block, check whether EFLAGS is live into a
15218  // successor.
15219  if (miI == BB->end()) {
15220    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
15221                                          sEnd = BB->succ_end();
15222         sItr != sEnd; ++sItr) {
15223      MachineBasicBlock* succ = *sItr;
15224      if (succ->isLiveIn(X86::EFLAGS))
15225        return false;
15226    }
15227  }
15228
15229  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
15230  // out. SelectMI should have a kill flag on EFLAGS.
15231  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
15232  return true;
15233}
15234
15235MachineBasicBlock *
15236X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
15237                                     MachineBasicBlock *BB) const {
15238  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15239  DebugLoc DL = MI->getDebugLoc();
15240
15241  // To "insert" a SELECT_CC instruction, we actually have to insert the
15242  // diamond control-flow pattern.  The incoming instruction knows the
15243  // destination vreg to set, the condition code register to branch on, the
15244  // true/false values to select between, and a branch opcode to use.
15245  const BasicBlock *LLVM_BB = BB->getBasicBlock();
15246  MachineFunction::iterator It = BB;
15247  ++It;
15248
15249  //  thisMBB:
15250  //  ...
15251  //   TrueVal = ...
15252  //   cmpTY ccX, r1, r2
15253  //   bCC copy1MBB
15254  //   fallthrough --> copy0MBB
15255  MachineBasicBlock *thisMBB = BB;
15256  MachineFunction *F = BB->getParent();
15257  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
15258  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
15259  F->insert(It, copy0MBB);
15260  F->insert(It, sinkMBB);
15261
15262  // If the EFLAGS register isn't dead in the terminator, then claim that it's
15263  // live into the sink and copy blocks.
15264  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
15265  if (!MI->killsRegister(X86::EFLAGS) &&
15266      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
15267    copy0MBB->addLiveIn(X86::EFLAGS);
15268    sinkMBB->addLiveIn(X86::EFLAGS);
15269  }
15270
15271  // Transfer the remainder of BB and its successor edges to sinkMBB.
15272  sinkMBB->splice(sinkMBB->begin(), BB,
15273                  llvm::next(MachineBasicBlock::iterator(MI)),
15274                  BB->end());
15275  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
15276
15277  // Add the true and fallthrough blocks as its successors.
15278  BB->addSuccessor(copy0MBB);
15279  BB->addSuccessor(sinkMBB);
15280
15281  // Create the conditional branch instruction.
15282  unsigned Opc =
15283    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
15284  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
15285
15286  //  copy0MBB:
15287  //   %FalseValue = ...
15288  //   # fallthrough to sinkMBB
15289  copy0MBB->addSuccessor(sinkMBB);
15290
15291  //  sinkMBB:
15292  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
15293  //  ...
15294  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
15295          TII->get(X86::PHI), MI->getOperand(0).getReg())
15296    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
15297    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
15298
15299  MI->eraseFromParent();   // The pseudo instruction is gone now.
15300  return sinkMBB;
15301}
15302
15303MachineBasicBlock *
15304X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
15305                                        bool Is64Bit) const {
15306  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15307  DebugLoc DL = MI->getDebugLoc();
15308  MachineFunction *MF = BB->getParent();
15309  const BasicBlock *LLVM_BB = BB->getBasicBlock();
15310
15311  assert(getTargetMachine().Options.EnableSegmentedStacks);
15312
15313  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
15314  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
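  // These TLS slots hold the current stack limit under the segmented-stacks
  // convention used with libgcc's __morestack (%fs:0x70 on x86-64, %gs:0x30 on
  // x86-32).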
15315
15316  // BB:
15317  //  ... [Till the alloca]
15318  // If stacklet is not large enough, jump to mallocMBB
15319  //
15320  // bumpMBB:
15321  //  Allocate by subtracting from RSP
15322  //  Jump to continueMBB
15323  //
15324  // mallocMBB:
15325  //  Allocate by call to runtime
15326  //
15327  // continueMBB:
15328  //  ...
15329  //  [rest of original BB]
15330  //
15331
15332  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
15333  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
15334  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
15335
15336  MachineRegisterInfo &MRI = MF->getRegInfo();
15337  const TargetRegisterClass *AddrRegClass =
15338    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
15339
15340  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
15341    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
15342    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
15343    SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
15344    sizeVReg = MI->getOperand(1).getReg(),
15345    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
15346
15347  MachineFunction::iterator MBBIter = BB;
15348  ++MBBIter;
15349
15350  MF->insert(MBBIter, bumpMBB);
15351  MF->insert(MBBIter, mallocMBB);
15352  MF->insert(MBBIter, continueMBB);
15353
15354  continueMBB->splice(continueMBB->begin(), BB, llvm::next
15355                      (MachineBasicBlock::iterator(MI)), BB->end());
15356  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
15357
15358  // Add code to the main basic block to check if the stack limit has been hit,
15359  // and if so, jump to mallocMBB otherwise to bumpMBB.
15360  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
15361  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
15362    .addReg(tmpSPVReg).addReg(sizeVReg);
15363  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
15364    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
15365    .addReg(SPLimitVReg);
15366  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
15367
15368  // bumpMBB simply decreases the stack pointer, since we know the current
15369  // stacklet has enough space.
15370  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
15371    .addReg(SPLimitVReg);
15372  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
15373    .addReg(SPLimitVReg);
15374  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
15375
15376  // Calls into a routine in libgcc to allocate more space from the heap.
15377  const uint32_t *RegMask =
15378    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
15379  if (Is64Bit) {
15380    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
15381      .addReg(sizeVReg);
15382    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
15383      .addExternalSymbol("__morestack_allocate_stack_space")
15384      .addRegMask(RegMask)
15385      .addReg(X86::RDI, RegState::Implicit)
15386      .addReg(X86::RAX, RegState::ImplicitDefine);
15387  } else {
15388    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
15389      .addImm(12);
15390    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
15391    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
15392      .addExternalSymbol("__morestack_allocate_stack_space")
15393      .addRegMask(RegMask)
15394      .addReg(X86::EAX, RegState::ImplicitDefine);
15395  }
15396
15397  if (!Is64Bit)
15398    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
15399      .addImm(16);
15400
15401  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
15402    .addReg(Is64Bit ? X86::RAX : X86::EAX);
15403  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
15404
15405  // Set up the CFG correctly.
15406  BB->addSuccessor(bumpMBB);
15407  BB->addSuccessor(mallocMBB);
15408  mallocMBB->addSuccessor(continueMBB);
15409  bumpMBB->addSuccessor(continueMBB);
15410
15411  // Take care of the PHI nodes.
15412  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
15413          MI->getOperand(0).getReg())
15414    .addReg(mallocPtrVReg).addMBB(mallocMBB)
15415    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
15416
15417  // Delete the original pseudo instruction.
15418  MI->eraseFromParent();
15419
15420  // And we're done.
15421  return continueMBB;
15422}
15423
15424MachineBasicBlock *
15425X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
15426                                          MachineBasicBlock *BB) const {
15427  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15428  DebugLoc DL = MI->getDebugLoc();
15429
15430  assert(!Subtarget->isTargetEnvMacho());
15431
15432  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
15433  // non-trivial part is impdef of ESP.
15434
15435  if (Subtarget->isTargetWin64()) {
15436    if (Subtarget->isTargetCygMing()) {
15437      // ___chkstk(Mingw64):
15438      // Clobbers R10, R11, RAX and EFLAGS.
15439      // Updates RSP.
15440      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
15441        .addExternalSymbol("___chkstk")
15442        .addReg(X86::RAX, RegState::Implicit)
15443        .addReg(X86::RSP, RegState::Implicit)
15444        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
15445        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
15446        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
15447    } else {
15448      // __chkstk(MSVCRT): does not update stack pointer.
15449      // Clobbers R10, R11 and EFLAGS.
15450      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
15451        .addExternalSymbol("__chkstk")
15452        .addReg(X86::RAX, RegState::Implicit)
15453        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
15454      // RAX has the offset to be subtracted from RSP.
15455      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
15456        .addReg(X86::RSP)
15457        .addReg(X86::RAX);
15458    }
15459  } else {
15460    const char *StackProbeSymbol =
15461      Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
15462
15463    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
15464      .addExternalSymbol(StackProbeSymbol)
15465      .addReg(X86::EAX, RegState::Implicit)
15466      .addReg(X86::ESP, RegState::Implicit)
15467      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
15468      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
15469      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
15470  }
15471
15472  MI->eraseFromParent();   // The pseudo instruction is gone now.
15473  return BB;
15474}
15475
15476MachineBasicBlock *
15477X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
15478                                      MachineBasicBlock *BB) const {
15479  // This is pretty easy.  We're taking the value that we received from
15480  // our load from the relocation, sticking it in either RDI (x86-64)
15481  // or EAX and doing an indirect call.  The return value will then
15482  // be in the normal return register.
15483  const X86InstrInfo *TII
15484    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
15485  DebugLoc DL = MI->getDebugLoc();
15486  MachineFunction *F = BB->getParent();
15487
15488  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
15489  assert(MI->getOperand(3).isGlobal() && "This should be a global");
15490
15491  // Get a register mask for the lowered call.
15492  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
15493  // proper register mask.
15494  const uint32_t *RegMask =
15495    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
15496  if (Subtarget->is64Bit()) {
15497    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
15498                                      TII->get(X86::MOV64rm), X86::RDI)
15499    .addReg(X86::RIP)
15500    .addImm(0).addReg(0)
15501    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
15502                      MI->getOperand(3).getTargetFlags())
15503    .addReg(0);
15504    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
15505    addDirectMem(MIB, X86::RDI);
15506    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
15507  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
15508    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
15509                                      TII->get(X86::MOV32rm), X86::EAX)
15510    .addReg(0)
15511    .addImm(0).addReg(0)
15512    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
15513                      MI->getOperand(3).getTargetFlags())
15514    .addReg(0);
15515    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
15516    addDirectMem(MIB, X86::EAX);
15517    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
15518  } else {
15519    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
15520                                      TII->get(X86::MOV32rm), X86::EAX)
15521    .addReg(TII->getGlobalBaseReg(F))
15522    .addImm(0).addReg(0)
15523    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
15524                      MI->getOperand(3).getTargetFlags())
15525    .addReg(0);
15526    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
15527    addDirectMem(MIB, X86::EAX);
15528    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
15529  }
15530
15531  MI->eraseFromParent(); // The pseudo instruction is gone now.
15532  return BB;
15533}
15534
15535MachineBasicBlock *
15536X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
15537                                    MachineBasicBlock *MBB) const {
15538  DebugLoc DL = MI->getDebugLoc();
15539  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15540
15541  MachineFunction *MF = MBB->getParent();
15542  MachineRegisterInfo &MRI = MF->getRegInfo();
15543
15544  const BasicBlock *BB = MBB->getBasicBlock();
15545  MachineFunction::iterator I = MBB;
15546  ++I;
15547
15548  // Memory Reference
15549  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
15550  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
15551
15552  unsigned DstReg;
15553  unsigned MemOpndSlot = 0;
15554
15555  unsigned CurOp = 0;
15556
15557  DstReg = MI->getOperand(CurOp++).getReg();
15558  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
15559  assert(RC->hasType(MVT::i32) && "Invalid destination!");
15560  unsigned mainDstReg = MRI.createVirtualRegister(RC);
15561  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
15562
15563  MemOpndSlot = CurOp;
15564
15565  MVT PVT = getPointerTy();
15566  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
15567         "Invalid Pointer Size!");
15568
15569  // For v = setjmp(buf), we generate
15570  //
15571  // thisMBB:
15572  //  buf[LabelOffset] = restoreMBB
15573  //  SjLjSetup restoreMBB
15574  //
15575  // mainMBB:
15576  //  v_main = 0
15577  //
15578  // sinkMBB:
15579  //  v = phi(main, restore)
15580  //
15581  // restoreMBB:
15582  //  v_restore = 1
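  //
  // i.e. the normal path through mainMBB yields 0, while a longjmp that lands in
  // restoreMBB yields 1, matching the usual setjmp return convention.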
15583
15584  MachineBasicBlock *thisMBB = MBB;
15585  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
15586  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
15587  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
15588  MF->insert(I, mainMBB);
15589  MF->insert(I, sinkMBB);
15590  MF->push_back(restoreMBB);
15591
15592  MachineInstrBuilder MIB;
15593
15594  // Transfer the remainder of BB and its successor edges to sinkMBB.
15595  sinkMBB->splice(sinkMBB->begin(), MBB,
15596                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
15597  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
15598
15599  // thisMBB:
15600  unsigned PtrStoreOpc = 0;
15601  unsigned LabelReg = 0;
15602  const int64_t LabelOffset = 1 * PVT.getStoreSize();
15603  Reloc::Model RM = getTargetMachine().getRelocationModel();
15604  bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
15605                     (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
15606
15607  // Prepare IP either in reg or imm.
15608  if (!UseImmLabel) {
15609    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
15610    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
15611    LabelReg = MRI.createVirtualRegister(PtrRC);
15612    if (Subtarget->is64Bit()) {
15613      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
15614              .addReg(X86::RIP)
15615              .addImm(0)
15616              .addReg(0)
15617              .addMBB(restoreMBB)
15618              .addReg(0);
15619    } else {
15620      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
15621      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
15622              .addReg(XII->getGlobalBaseReg(MF))
15623              .addImm(0)
15624              .addReg(0)
15625              .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
15626              .addReg(0);
15627    }
15628  } else
15629    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
15630  // Store IP
15631  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
15632  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
15633    if (i == X86::AddrDisp)
15634      MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
15635    else
15636      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
15637  }
15638  if (!UseImmLabel)
15639    MIB.addReg(LabelReg);
15640  else
15641    MIB.addMBB(restoreMBB);
15642  MIB.setMemRefs(MMOBegin, MMOEnd);
15643  // Setup
15644  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
15645          .addMBB(restoreMBB);
15646
15647  const X86RegisterInfo *RegInfo =
15648    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
15649  MIB.addRegMask(RegInfo->getNoPreservedMask());
15650  thisMBB->addSuccessor(mainMBB);
15651  thisMBB->addSuccessor(restoreMBB);
15652
15653  // mainMBB:
15654  //  mainDstReg = 0
15655  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
15656  mainMBB->addSuccessor(sinkMBB);
15657
15658  // sinkMBB:
15659  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
15660          TII->get(X86::PHI), DstReg)
15661    .addReg(mainDstReg).addMBB(mainMBB)
15662    .addReg(restoreDstReg).addMBB(restoreMBB);
15663
15664  // restoreMBB:
15665  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
15666  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
15667  restoreMBB->addSuccessor(sinkMBB);
15668
15669  MI->eraseFromParent();
15670  return sinkMBB;
15671}
15672
15673MachineBasicBlock *
15674X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
15675                                     MachineBasicBlock *MBB) const {
15676  DebugLoc DL = MI->getDebugLoc();
15677  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15678
15679  MachineFunction *MF = MBB->getParent();
15680  MachineRegisterInfo &MRI = MF->getRegInfo();
15681
15682  // Memory Reference
15683  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
15684  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
15685
15686  MVT PVT = getPointerTy();
15687  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
15688         "Invalid Pointer Size!");
15689
15690  const TargetRegisterClass *RC =
15691    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
15692  unsigned Tmp = MRI.createVirtualRegister(RC);
15693  // FP is only written here, never read afterwards, so it can be treated as an ordinary GPR.
15694  const X86RegisterInfo *RegInfo =
15695    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
15696  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
15697  unsigned SP = RegInfo->getStackRegister();
15698
15699  MachineInstrBuilder MIB;
15700
15701  const int64_t LabelOffset = 1 * PVT.getStoreSize();
15702  const int64_t SPOffset = 2 * PVT.getStoreSize();
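  // The setjmp buffer stores the frame pointer at offset 0, the resume address at
  // LabelOffset and the stack pointer at SPOffset; reload all three and jump.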
15703
15704  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
15705  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
15706
15707  // Reload FP
15708  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
15709  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
15710    MIB.addOperand(MI->getOperand(i));
15711  MIB.setMemRefs(MMOBegin, MMOEnd);
15712  // Reload IP
15713  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
15714  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
15715    if (i == X86::AddrDisp)
15716      MIB.addDisp(MI->getOperand(i), LabelOffset);
15717    else
15718      MIB.addOperand(MI->getOperand(i));
15719  }
15720  MIB.setMemRefs(MMOBegin, MMOEnd);
15721  // Reload SP
15722  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
15723  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
15724    if (i == X86::AddrDisp)
15725      MIB.addDisp(MI->getOperand(i), SPOffset);
15726    else
15727      MIB.addOperand(MI->getOperand(i));
15728  }
15729  MIB.setMemRefs(MMOBegin, MMOEnd);
15730  // Jump
15731  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
15732
15733  MI->eraseFromParent();
15734  return MBB;
15735}
15736
15737MachineBasicBlock *
15738X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
15739                                               MachineBasicBlock *BB) const {
15740  switch (MI->getOpcode()) {
15741  default: llvm_unreachable("Unexpected instr type to insert");
15742  case X86::TAILJMPd64:
15743  case X86::TAILJMPr64:
15744  case X86::TAILJMPm64:
15745    llvm_unreachable("TAILJMP64 would not be touched here.");
15746  case X86::TCRETURNdi64:
15747  case X86::TCRETURNri64:
15748  case X86::TCRETURNmi64:
15749    return BB;
15750  case X86::WIN_ALLOCA:
15751    return EmitLoweredWinAlloca(MI, BB);
15752  case X86::SEG_ALLOCA_32:
15753    return EmitLoweredSegAlloca(MI, BB, false);
15754  case X86::SEG_ALLOCA_64:
15755    return EmitLoweredSegAlloca(MI, BB, true);
15756  case X86::TLSCall_32:
15757  case X86::TLSCall_64:
15758    return EmitLoweredTLSCall(MI, BB);
15759  case X86::CMOV_GR8:
15760  case X86::CMOV_FR32:
15761  case X86::CMOV_FR64:
15762  case X86::CMOV_V4F32:
15763  case X86::CMOV_V2F64:
15764  case X86::CMOV_V2I64:
15765  case X86::CMOV_V8F32:
15766  case X86::CMOV_V4F64:
15767  case X86::CMOV_V4I64:
15768  case X86::CMOV_V16F32:
15769  case X86::CMOV_V8F64:
15770  case X86::CMOV_V8I64:
15771  case X86::CMOV_GR16:
15772  case X86::CMOV_GR32:
15773  case X86::CMOV_RFP32:
15774  case X86::CMOV_RFP64:
15775  case X86::CMOV_RFP80:
15776    return EmitLoweredSelect(MI, BB);
15777
15778  case X86::FP32_TO_INT16_IN_MEM:
15779  case X86::FP32_TO_INT32_IN_MEM:
15780  case X86::FP32_TO_INT64_IN_MEM:
15781  case X86::FP64_TO_INT16_IN_MEM:
15782  case X86::FP64_TO_INT32_IN_MEM:
15783  case X86::FP64_TO_INT64_IN_MEM:
15784  case X86::FP80_TO_INT16_IN_MEM:
15785  case X86::FP80_TO_INT32_IN_MEM:
15786  case X86::FP80_TO_INT64_IN_MEM: {
15787    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15788    DebugLoc DL = MI->getDebugLoc();
15789
15790    // Change the floating point control register to use "round towards zero"
15791    // mode when truncating to an integer value.
15792    MachineFunction *F = BB->getParent();
15793    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
15794    addFrameReference(BuildMI(*BB, MI, DL,
15795                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
15796
15797    // Load the old value of the high byte of the control word...
15798    unsigned OldCW =
15799      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
15800    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
15801                      CWFrameIdx);
15802
15803    // Set the high part to be round to zero...
15804    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
15805      .addImm(0xC7F);
15806
15807    // Reload the modified control word now...
15808    addFrameReference(BuildMI(*BB, MI, DL,
15809                              TII->get(X86::FLDCW16m)), CWFrameIdx);
15810
15811    // Restore the memory image of control word to original value
15812    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
15813      .addReg(OldCW);
15814
15815    // Get the X86 opcode to use.
15816    unsigned Opc;
15817    switch (MI->getOpcode()) {
15818    default: llvm_unreachable("illegal opcode!");
15819    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
15820    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
15821    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
15822    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
15823    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
15824    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
15825    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
15826    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
15827    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
15828    }
15829
15830    X86AddressMode AM;
15831    MachineOperand &Op = MI->getOperand(0);
15832    if (Op.isReg()) {
15833      AM.BaseType = X86AddressMode::RegBase;
15834      AM.Base.Reg = Op.getReg();
15835    } else {
15836      AM.BaseType = X86AddressMode::FrameIndexBase;
15837      AM.Base.FrameIndex = Op.getIndex();
15838    }
15839    Op = MI->getOperand(1);
15840    if (Op.isImm())
15841      AM.Scale = Op.getImm();
15842    Op = MI->getOperand(2);
15843    if (Op.isImm())
15844      AM.IndexReg = Op.getImm();
15845    Op = MI->getOperand(3);
15846    if (Op.isGlobal()) {
15847      AM.GV = Op.getGlobal();
15848    } else {
15849      AM.Disp = Op.getImm();
15850    }
15851    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
15852                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
15853
15854    // Reload the original control word now.
15855    addFrameReference(BuildMI(*BB, MI, DL,
15856                              TII->get(X86::FLDCW16m)), CWFrameIdx);
15857
15858    MI->eraseFromParent();   // The pseudo instruction is gone now.
15859    return BB;
15860  }
15861    // String/text processing lowering.
15862  case X86::PCMPISTRM128REG:
15863  case X86::VPCMPISTRM128REG:
15864  case X86::PCMPISTRM128MEM:
15865  case X86::VPCMPISTRM128MEM:
15866  case X86::PCMPESTRM128REG:
15867  case X86::VPCMPESTRM128REG:
15868  case X86::PCMPESTRM128MEM:
15869  case X86::VPCMPESTRM128MEM:
15870    assert(Subtarget->hasSSE42() &&
15871           "Target must have SSE4.2 or AVX features enabled");
15872    return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
15873
15874  // String/text processing lowering.
15875  case X86::PCMPISTRIREG:
15876  case X86::VPCMPISTRIREG:
15877  case X86::PCMPISTRIMEM:
15878  case X86::VPCMPISTRIMEM:
15879  case X86::PCMPESTRIREG:
15880  case X86::VPCMPESTRIREG:
15881  case X86::PCMPESTRIMEM:
15882  case X86::VPCMPESTRIMEM:
15883    assert(Subtarget->hasSSE42() &&
15884           "Target must have SSE4.2 or AVX features enabled");
15885    return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
15886
15887  // Thread synchronization.
15888  case X86::MONITOR:
15889    return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
15890
15891  // xbegin
15892  case X86::XBEGIN:
15893    return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
15894
15895  // Atomic Lowering.
15896  case X86::ATOMAND8:
15897  case X86::ATOMAND16:
15898  case X86::ATOMAND32:
15899  case X86::ATOMAND64:
15900    // Fall through
15901  case X86::ATOMOR8:
15902  case X86::ATOMOR16:
15903  case X86::ATOMOR32:
15904  case X86::ATOMOR64:
15905    // Fall through
15906  case X86::ATOMXOR16:
15907  case X86::ATOMXOR8:
15908  case X86::ATOMXOR32:
15909  case X86::ATOMXOR64:
15910    // Fall through
15911  case X86::ATOMNAND8:
15912  case X86::ATOMNAND16:
15913  case X86::ATOMNAND32:
15914  case X86::ATOMNAND64:
15915    // Fall through
15916  case X86::ATOMMAX8:
15917  case X86::ATOMMAX16:
15918  case X86::ATOMMAX32:
15919  case X86::ATOMMAX64:
15920    // Fall through
15921  case X86::ATOMMIN8:
15922  case X86::ATOMMIN16:
15923  case X86::ATOMMIN32:
15924  case X86::ATOMMIN64:
15925    // Fall through
15926  case X86::ATOMUMAX8:
15927  case X86::ATOMUMAX16:
15928  case X86::ATOMUMAX32:
15929  case X86::ATOMUMAX64:
15930    // Fall through
15931  case X86::ATOMUMIN8:
15932  case X86::ATOMUMIN16:
15933  case X86::ATOMUMIN32:
15934  case X86::ATOMUMIN64:
15935    return EmitAtomicLoadArith(MI, BB);
15936
15937  // This group does 64-bit operations on a 32-bit host.
15938  case X86::ATOMAND6432:
15939  case X86::ATOMOR6432:
15940  case X86::ATOMXOR6432:
15941  case X86::ATOMNAND6432:
15942  case X86::ATOMADD6432:
15943  case X86::ATOMSUB6432:
15944  case X86::ATOMMAX6432:
15945  case X86::ATOMMIN6432:
15946  case X86::ATOMUMAX6432:
15947  case X86::ATOMUMIN6432:
15948  case X86::ATOMSWAP6432:
15949    return EmitAtomicLoadArith6432(MI, BB);
15950
15951  case X86::VASTART_SAVE_XMM_REGS:
15952    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
15953
15954  case X86::VAARG_64:
15955    return EmitVAARG64WithCustomInserter(MI, BB);
15956
15957  case X86::EH_SjLj_SetJmp32:
15958  case X86::EH_SjLj_SetJmp64:
15959    return emitEHSjLjSetJmp(MI, BB);
15960
15961  case X86::EH_SjLj_LongJmp32:
15962  case X86::EH_SjLj_LongJmp64:
15963    return emitEHSjLjLongJmp(MI, BB);
15964  }
15965}
15966
15967//===----------------------------------------------------------------------===//
15968//                           X86 Optimization Hooks
15969//===----------------------------------------------------------------------===//
15970
15971void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
15972                                                       APInt &KnownZero,
15973                                                       APInt &KnownOne,
15974                                                       const SelectionDAG &DAG,
15975                                                       unsigned Depth) const {
15976  unsigned BitWidth = KnownZero.getBitWidth();
15977  unsigned Opc = Op.getOpcode();
15978  assert((Opc >= ISD::BUILTIN_OP_END ||
15979          Opc == ISD::INTRINSIC_WO_CHAIN ||
15980          Opc == ISD::INTRINSIC_W_CHAIN ||
15981          Opc == ISD::INTRINSIC_VOID) &&
15982         "Should use MaskedValueIsZero if you don't know whether Op"
15983         " is a target node!");
15984
15985  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
15986  switch (Opc) {
15987  default: break;
15988  case X86ISD::ADD:
15989  case X86ISD::SUB:
15990  case X86ISD::ADC:
15991  case X86ISD::SBB:
15992  case X86ISD::SMUL:
15993  case X86ISD::UMUL:
15994  case X86ISD::INC:
15995  case X86ISD::DEC:
15996  case X86ISD::OR:
15997  case X86ISD::XOR:
15998  case X86ISD::AND:
15999    // These nodes' second result is a boolean.
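    // That boolean is known to be 0 or 1, so every bit above bit 0 is zero.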
16000    if (Op.getResNo() == 0)
16001      break;
16002    // Fallthrough
16003  case X86ISD::SETCC:
16004    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
16005    break;
16006  case ISD::INTRINSIC_WO_CHAIN: {
16007    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
16008    unsigned NumLoBits = 0;
16009    switch (IntId) {
16010    default: break;
16011    case Intrinsic::x86_sse_movmsk_ps:
16012    case Intrinsic::x86_avx_movmsk_ps_256:
16013    case Intrinsic::x86_sse2_movmsk_pd:
16014    case Intrinsic::x86_avx_movmsk_pd_256:
16015    case Intrinsic::x86_mmx_pmovmskb:
16016    case Intrinsic::x86_sse2_pmovmskb_128:
16017    case Intrinsic::x86_avx2_pmovmskb: {
16018      // High bits of movmskp{s|d}, pmovmskb are known zero.
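      // e.g. movmskps on a v4f32 produces only a 4-bit mask, so bits [31:4] of
      // the i32 result are zero.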
16019      switch (IntId) {
16020        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
16021        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
16022        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
16023        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
16024        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
16025        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
16026        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
16027        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
16028      }
16029      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
16030      break;
16031    }
16032    }
16033    break;
16034  }
16035  }
16036}
16037
16038unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
16039                                                         unsigned Depth) const {
16040  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
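  // Every bit of that all-ones/all-zero value is a copy of the sign bit, so
  // the whole scalar width counts as sign bits (e.g. 32 for an i32 result).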
16041  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
16042    return Op.getValueType().getScalarType().getSizeInBits();
16043
16044  // Fallback case.
16045  return 1;
16046}
16047
16048/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
16049/// node is a GlobalAddress + offset.
16050bool X86TargetLowering::isGAPlusOffset(SDNode *N,
16051                                       const GlobalValue* &GA,
16052                                       int64_t &Offset) const {
16053  if (N->getOpcode() == X86ISD::Wrapper) {
16054    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
16055      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
16056      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
16057      return true;
16058    }
16059  }
16060  return TargetLowering::isGAPlusOffset(N, GA, Offset);
16061}
16062
16063/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
16064/// same as extracting the high 128-bit part of a 256-bit vector and then
16065/// inserting the result into the low part of a new 256-bit vector.
16066static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
16067  EVT VT = SVOp->getValueType(0);
16068  unsigned NumElems = VT.getVectorNumElements();
16069
16070  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16071  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
16072    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
16073        SVOp->getMaskElt(j) >= 0)
16074      return false;
16075
16076  return true;
16077}
16078
16079/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
16080/// same as extracting the low 128-bit part of a 256-bit vector and then
16081/// inserting the result into the high part of a new 256-bit vector.
16082static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
16083  EVT VT = SVOp->getValueType(0);
16084  unsigned NumElems = VT.getVectorNumElements();
16085
16086  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16087  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
16088    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
16089        SVOp->getMaskElt(j) >= 0)
16090      return false;
16091
16092  return true;
16093}
16094
16095/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
16096static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
16097                                        TargetLowering::DAGCombinerInfo &DCI,
16098                                        const X86Subtarget* Subtarget) {
16099  SDLoc dl(N);
16100  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
16101  SDValue V1 = SVOp->getOperand(0);
16102  SDValue V2 = SVOp->getOperand(1);
16103  EVT VT = SVOp->getValueType(0);
16104  unsigned NumElems = VT.getVectorNumElements();
16105
16106  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
16107      V2.getOpcode() == ISD::CONCAT_VECTORS) {
16108    //
16109    //                   0,0,0,...
16110    //                      |
16111    //    V      UNDEF    BUILD_VECTOR    UNDEF
16112    //     \      /           \           /
16113    //  CONCAT_VECTOR         CONCAT_VECTOR
16114    //         \                  /
16115    //          \                /
16116    //          RESULT: V + zero extended
16117    //
16118    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
16119        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
16120        V1.getOperand(1).getOpcode() != ISD::UNDEF)
16121      return SDValue();
16122
16123    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
16124      return SDValue();
16125
16126    // To match the shuffle mask, the first half of the mask should
16127    // be exactly the first vector, and all the rest a splat with the
16128    // first element of the second one.
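    // e.g. for v8i32 the expected mask is <0, 1, 2, 3, 8, 8, 8, 8>, with undefs
    // allowed in any position.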
16129    for (unsigned i = 0; i != NumElems/2; ++i)
16130      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
16131          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
16132        return SDValue();
16133
16134    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
16135    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
16136      if (Ld->hasNUsesOfValue(1, 0)) {
16137        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
16138        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
16139        SDValue ResNode =
16140          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
16141                                  array_lengthof(Ops),
16142                                  Ld->getMemoryVT(),
16143                                  Ld->getPointerInfo(),
16144                                  Ld->getAlignment(),
16145                                  false/*isVolatile*/, true/*ReadMem*/,
16146                                  false/*WriteMem*/);
16147
16148        // Make sure the newly-created LOAD is in the same position as Ld in
16149        // terms of dependency. We create a TokenFactor for Ld and ResNode,
16150        // and update uses of Ld's output chain to use the TokenFactor.
16151        if (Ld->hasAnyUseOfValue(1)) {
16152          SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
16153                             SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
16154          DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
16155          DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
16156                                 SDValue(ResNode.getNode(), 1));
16157        }
16158
16159        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
16160      }
16161    }
16162
16163    // Emit a zeroed vector and insert the desired subvector on its
16164    // first half.
16165    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
16166    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
16167    return DCI.CombineTo(N, InsV);
16168  }
16169
16170  //===--------------------------------------------------------------------===//
16171  // Combine some shuffles into subvector extracts and inserts:
16172  //
16173
16174  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16175  if (isShuffleHigh128VectorInsertLow(SVOp)) {
16176    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
16177    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
16178    return DCI.CombineTo(N, InsV);
16179  }
16180
16181  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16182  if (isShuffleLow128VectorInsertHigh(SVOp)) {
16183    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
16184    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
16185    return DCI.CombineTo(N, InsV);
16186  }
16187
16188  return SDValue();
16189}
16190
16191/// PerformShuffleCombine - Performs several different shuffle combines.
16192static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
16193                                     TargetLowering::DAGCombinerInfo &DCI,
16194                                     const X86Subtarget *Subtarget) {
16195  SDLoc dl(N);
16196  EVT VT = N->getValueType(0);
16197
16198  // Don't create instructions with illegal types after legalize types has run.
16199  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16200  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
16201    return SDValue();
16202
16203  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
16204  if (Subtarget->hasFp256() && VT.is256BitVector() &&
16205      N->getOpcode() == ISD::VECTOR_SHUFFLE)
16206    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
16207
16208  // Only handle 128-bit wide vectors from here on.
16209  if (!VT.is128BitVector())
16210    return SDValue();
16211
16212  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
16213  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
16214  // consecutive, non-overlapping, and in the right order.
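  // e.g. four i32 loads from a, a+4, a+8 and a+12 gathered this way can become
  // a single 128-bit load from a.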
16215  SmallVector<SDValue, 16> Elts;
16216  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
16217    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
16218
16219  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
16220}
16221
16222/// PerformTruncateCombine - Converts truncate operation to
16223/// a sequence of vector shuffle operations.
16224/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
16225static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
16226                                      TargetLowering::DAGCombinerInfo &DCI,
16227                                      const X86Subtarget *Subtarget)  {
16228  return SDValue();
16229}
16230
16231/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
16232/// specific shuffle of a load can be folded into a single element load.
16233/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
16234/// shuffles have been custom lowered, so we need to handle those here.
16235static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
16236                                         TargetLowering::DAGCombinerInfo &DCI) {
16237  if (DCI.isBeforeLegalizeOps())
16238    return SDValue();
16239
16240  SDValue InVec = N->getOperand(0);
16241  SDValue EltNo = N->getOperand(1);
16242
16243  if (!isa<ConstantSDNode>(EltNo))
16244    return SDValue();
16245
16246  EVT VT = InVec.getValueType();
16247
16248  bool HasShuffleIntoBitcast = false;
16249  if (InVec.getOpcode() == ISD::BITCAST) {
16250    // Don't duplicate a load with other uses.
16251    if (!InVec.hasOneUse())
16252      return SDValue();
16253    EVT BCVT = InVec.getOperand(0).getValueType();
16254    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
16255      return SDValue();
16256    InVec = InVec.getOperand(0);
16257    HasShuffleIntoBitcast = true;
16258  }
16259
16260  if (!isTargetShuffle(InVec.getOpcode()))
16261    return SDValue();
16262
16263  // Don't duplicate a load with other uses.
16264  if (!InVec.hasOneUse())
16265    return SDValue();
16266
16267  SmallVector<int, 16> ShuffleMask;
16268  bool UnaryShuffle;
16269  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
16270                            UnaryShuffle))
16271    return SDValue();
16272
16273  // Select the input vector, guarding against an out-of-range extract index.
16274  unsigned NumElems = VT.getVectorNumElements();
16275  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
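  // Mask values in [0, NumElems) select from the first shuffle operand, values
  // in [NumElems, 2*NumElems) from the second.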
16276  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
16277  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
16278                                         : InVec.getOperand(1);
16279
16280  // If inputs to shuffle are the same for both ops, then allow 2 uses
16281  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
16282
16283  if (LdNode.getOpcode() == ISD::BITCAST) {
16284    // Don't duplicate a load with other uses.
16285    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
16286      return SDValue();
16287
16288    AllowedUses = 1; // only allow 1 load use if we have a bitcast
16289    LdNode = LdNode.getOperand(0);
16290  }
16291
16292  if (!ISD::isNormalLoad(LdNode.getNode()))
16293    return SDValue();
16294
16295  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
16296
16297  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
16298    return SDValue();
16299
16300  if (HasShuffleIntoBitcast) {
16301    // If there's a bitcast before the shuffle, check if the load type and
16302    // alignment are valid.
16303    unsigned Align = LN0->getAlignment();
16304    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16305    unsigned NewAlign = TLI.getDataLayout()->
16306      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
16307
16308    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
16309      return SDValue();
16310  }
16311
16312  // All checks match so transform back to vector_shuffle so that DAG combiner
16313  // can finish the job
16314  SDLoc dl(N);
16315
16316  // Create shuffle node taking into account the case that it's a unary shuffle
16317  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
16318  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
16319                                 InVec.getOperand(0), Shuffle,
16320                                 &ShuffleMask[0]);
16321  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
16322  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
16323                     EltNo);
16324}
16325
16326/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
16327/// generation and convert it from being a bunch of shuffles and extracts
16328/// to a simple store and scalar loads to extract the elements.
16329static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
16330                                         TargetLowering::DAGCombinerInfo &DCI) {
16331  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
16332  if (NewOp.getNode())
16333    return NewOp;
16334
16335  SDValue InputVector = N->getOperand(0);
16336  // Detect whether we are trying to convert from mmx to i32 and the bitcast
16337  // from mmx to v2i32 has a single usage.
16338  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
16339      InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
16340      InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
16341    return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
16342                       N->getValueType(0),
16343                       InputVector.getNode()->getOperand(0));
16344
16345  // Only operate on vectors of 4 elements, where the alternative shuffling
16346  // gets to be more expensive.
16347  if (InputVector.getValueType() != MVT::v4i32)
16348    return SDValue();
16349
16350  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
16351  // single use which is a sign-extend or zero-extend, and all elements are
16352  // used.
16353  SmallVector<SDNode *, 4> Uses;
16354  unsigned ExtractedElements = 0;
16355  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
16356       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
16357    if (UI.getUse().getResNo() != InputVector.getResNo())
16358      return SDValue();
16359
16360    SDNode *Extract = *UI;
16361    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16362      return SDValue();
16363
16364    if (Extract->getValueType(0) != MVT::i32)
16365      return SDValue();
16366    if (!Extract->hasOneUse())
16367      return SDValue();
16368    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
16369        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
16370      return SDValue();
16371    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
16372      return SDValue();
16373
16374    // Record which element was extracted.
16375    ExtractedElements |=
16376      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
16377
16378    Uses.push_back(Extract);
16379  }
16380
16381  // If not all the elements were used, this may not be worthwhile.
16382  if (ExtractedElements != 15)
16383    return SDValue();
16384
16385  // Ok, we've now decided to do the transformation.
16386  SDLoc dl(InputVector);
16387
16388  // Store the value to a temporary stack slot.
16389  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
16390  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
16391                            MachinePointerInfo(), false, false, 0);
16392
16393  // Replace each use (extract) with a load of the appropriate element.
16394  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
16395       UE = Uses.end(); UI != UE; ++UI) {
16396    SDNode *Extract = *UI;
16397
16398    // Compute the element's address.
16399    SDValue Idx = Extract->getOperand(1);
16400    unsigned EltSize =
16401        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
16402    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
16403    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16404    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
16405
16406    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
16407                                     StackPtr, OffsetVal);
16408
16409    // Load the scalar.
16410    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
16411                                     ScalarAddr, MachinePointerInfo(),
16412                                     false, false, false, 0);
16413
16414    // Replace the extract with the load.
16415    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
16416  }
16417
16418  // The replacement was made in place; don't return anything.
16419  return SDValue();
16420}
16421
16422/// \brief Matches a VSELECT onto min/max; returns 0 if the node doesn't match.
16423static std::pair<unsigned, bool>
16424matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
16425                   SelectionDAG &DAG, const X86Subtarget *Subtarget) {
16426  if (!VT.isVector())
16427    return std::make_pair(0, false);
16428
16429  bool NeedSplit = false;
16430  switch (VT.getSimpleVT().SimpleTy) {
16431  default: return std::make_pair(0, false);
16432  case MVT::v32i8:
16433  case MVT::v16i16:
16434  case MVT::v8i32:
16435    if (!Subtarget->hasAVX2())
16436      NeedSplit = true;
16437    if (!Subtarget->hasAVX())
16438      return std::make_pair(0, false);
16439    break;
16440  case MVT::v16i8:
16441  case MVT::v8i16:
16442  case MVT::v4i32:
16443    if (!Subtarget->hasSSE2())
16444      return std::make_pair(0, false);
16445  }
16446
16447  // SSE2 has only a small subset of the operations.
16448  bool hasUnsigned = Subtarget->hasSSE41() ||
16449                     (Subtarget->hasSSE2() && VT == MVT::v16i8);
16450  bool hasSigned = Subtarget->hasSSE41() ||
16451                   (Subtarget->hasSSE2() && VT == MVT::v8i16);
16452
16453  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16454
16455  unsigned Opc = 0;
16456  // Check for x CC y ? x : y.
16457  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
16458      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
16459    switch (CC) {
16460    default: break;
16461    case ISD::SETULT:
16462    case ISD::SETULE:
16463      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
16464    case ISD::SETUGT:
16465    case ISD::SETUGE:
16466      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
16467    case ISD::SETLT:
16468    case ISD::SETLE:
16469      Opc = hasSigned ? X86ISD::SMIN : 0; break;
16470    case ISD::SETGT:
16471    case ISD::SETGE:
16472      Opc = hasSigned ? X86ISD::SMAX : 0; break;
16473    }
16474  // Check for x CC y ? y : x -- a min/max with reversed arms.
16475  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
16476             DAG.isEqualTo(RHS, Cond.getOperand(0))) {
16477    switch (CC) {
16478    default: break;
16479    case ISD::SETULT:
16480    case ISD::SETULE:
16481      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
16482    case ISD::SETUGT:
16483    case ISD::SETUGE:
16484      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
16485    case ISD::SETLT:
16486    case ISD::SETLE:
16487      Opc = hasSigned ? X86ISD::SMAX : 0; break;
16488    case ISD::SETGT:
16489    case ISD::SETGE:
16490      Opc = hasSigned ? X86ISD::SMIN : 0; break;
16491    }
16492  }
16493
16494  return std::make_pair(Opc, NeedSplit);
16495}
16496
16497/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
16498/// nodes.
16499static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
16500                                    TargetLowering::DAGCombinerInfo &DCI,
16501                                    const X86Subtarget *Subtarget) {
16502  SDLoc DL(N);
16503  SDValue Cond = N->getOperand(0);
16504  // Get the LHS/RHS of the select.
16505  SDValue LHS = N->getOperand(1);
16506  SDValue RHS = N->getOperand(2);
16507  EVT VT = LHS.getValueType();
16508  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16509
16510  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
16511  // instructions match the semantics of the common C idiom x<y?x:y but not
16512  // x<=y?x:y, because of how they handle negative zero (which can be
16513  // ignored in unsafe-math mode).
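  // (SSE min/max return the second source operand whenever the comparison is
  // false or unordered, which is why only the strict-inequality idiom maps
  // directly onto them.)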
16514  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
16515      VT != MVT::f80 && TLI.isTypeLegal(VT) &&
16516      (Subtarget->hasSSE2() ||
16517       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
16518    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16519
16520    unsigned Opcode = 0;
16521    // Check for x CC y ? x : y.
16522    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
16523        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
16524      switch (CC) {
16525      default: break;
16526      case ISD::SETULT:
16527        // Converting this to a min would handle NaNs incorrectly, and swapping
16528        // the operands would cause it to handle comparisons between positive
16529        // and negative zero incorrectly.
16530        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
16531          if (!DAG.getTarget().Options.UnsafeFPMath &&
16532              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
16533            break;
16534          std::swap(LHS, RHS);
16535        }
16536        Opcode = X86ISD::FMIN;
16537        break;
16538      case ISD::SETOLE:
16539        // Converting this to a min would handle comparisons between positive
16540        // and negative zero incorrectly.
16541        if (!DAG.getTarget().Options.UnsafeFPMath &&
16542            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
16543          break;
16544        Opcode = X86ISD::FMIN;
16545        break;
16546      case ISD::SETULE:
16547        // Converting this to a min would handle both negative zeros and NaNs
16548        // incorrectly, but we can swap the operands to fix both.
16549        std::swap(LHS, RHS);
16550      case ISD::SETOLT:
16551      case ISD::SETLT:
16552      case ISD::SETLE:
16553        Opcode = X86ISD::FMIN;
16554        break;
16555
16556      case ISD::SETOGE:
16557        // Converting this to a max would handle comparisons between positive
16558        // and negative zero incorrectly.
16559        if (!DAG.getTarget().Options.UnsafeFPMath &&
16560            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
16561          break;
16562        Opcode = X86ISD::FMAX;
16563        break;
16564      case ISD::SETUGT:
16565        // Converting this to a max would handle NaNs incorrectly, and swapping
16566        // the operands would cause it to handle comparisons between positive
16567        // and negative zero incorrectly.
16568        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
16569          if (!DAG.getTarget().Options.UnsafeFPMath &&
16570              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
16571            break;
16572          std::swap(LHS, RHS);
16573        }
16574        Opcode = X86ISD::FMAX;
16575        break;
16576      case ISD::SETUGE:
16577        // Converting this to a max would handle both negative zeros and NaNs
16578        // incorrectly, but we can swap the operands to fix both.
16579        std::swap(LHS, RHS);
16580      case ISD::SETOGT:
16581      case ISD::SETGT:
16582      case ISD::SETGE:
16583        Opcode = X86ISD::FMAX;
16584        break;
16585      }
16586    // Check for x CC y ? y : x -- a min/max with reversed arms.
16587    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
16588               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
16589      switch (CC) {
16590      default: break;
16591      case ISD::SETOGE:
16592        // Converting this to a min would handle comparisons between positive
16593        // and negative zero incorrectly, and swapping the operands would
16594        // cause it to handle NaNs incorrectly.
16595        if (!DAG.getTarget().Options.UnsafeFPMath &&
16596            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
16597          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
16598            break;
16599          std::swap(LHS, RHS);
16600        }
16601        Opcode = X86ISD::FMIN;
16602        break;
16603      case ISD::SETUGT:
16604        // Converting this to a min would handle NaNs incorrectly.
16605        if (!DAG.getTarget().Options.UnsafeFPMath &&
16606            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
16607          break;
16608        Opcode = X86ISD::FMIN;
16609        break;
16610      case ISD::SETUGE:
16611        // Converting this to a min would handle both negative zeros and NaNs
16612        // incorrectly, but we can swap the operands to fix both.
16613        std::swap(LHS, RHS);
16614      case ISD::SETOGT:
16615      case ISD::SETGT:
16616      case ISD::SETGE:
16617        Opcode = X86ISD::FMIN;
16618        break;
16619
16620      case ISD::SETULT:
16621        // Converting this to a max would handle NaNs incorrectly.
16622        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
16623          break;
16624        Opcode = X86ISD::FMAX;
16625        break;
16626      case ISD::SETOLE:
16627        // Converting this to a max would handle comparisons between positive
16628        // and negative zero incorrectly, and swapping the operands would
16629        // cause it to handle NaNs incorrectly.
16630        if (!DAG.getTarget().Options.UnsafeFPMath &&
16631            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
16632          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
16633            break;
16634          std::swap(LHS, RHS);
16635        }
16636        Opcode = X86ISD::FMAX;
16637        break;
16638      case ISD::SETULE:
16639        // Converting this to a max would handle both negative zeros and NaNs
16640        // incorrectly, but we can swap the operands to fix both.
16641        std::swap(LHS, RHS);
16642      case ISD::SETOLT:
16643      case ISD::SETLT:
16644      case ISD::SETLE:
16645        Opcode = X86ISD::FMAX;
16646        break;
16647      }
16648    }
16649
16650    if (Opcode)
16651      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
16652  }
16653
16654  EVT CondVT = Cond.getValueType();
16655  if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
16656      CondVT.getVectorElementType() == MVT::i1) {
16657    // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
16658    // lowering on AVX-512. In this case we convert it to
16659    // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
16660    // The same situation for all 128 and 256-bit vectors of i8 and i16
16661    EVT OpVT = LHS.getValueType();
16662    if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
16663        (OpVT.getVectorElementType() == MVT::i8 ||
16664         OpVT.getVectorElementType() == MVT::i16)) {
16665      Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
16666      DCI.AddToWorklist(Cond.getNode());
16667      return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
16668    }
16669  }
16670  // If this is a select between two integer constants, try to do some
16671  // optimizations.
16672  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
16673    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
16674      // Don't do this for crazy integer types.
16675      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
16676        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
16677        // so that TrueC (the true value) is larger than FalseC.
16678        bool NeedsCondInvert = false;
16679
16680        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
16681            // Efficiently invertible.
16682            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
16683             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
16684              isa<ConstantSDNode>(Cond.getOperand(1))))) {
16685          NeedsCondInvert = true;
16686          std::swap(TrueC, FalseC);
16687        }
16688
16689        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
16690        if (FalseC->getAPIntValue() == 0 &&
16691            TrueC->getAPIntValue().isPowerOf2()) {
16692          if (NeedsCondInvert) // Invert the condition if needed.
16693            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
16694                               DAG.getConstant(1, Cond.getValueType()));
16695
16696          // Zero extend the condition if needed.
16697          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
16698
16699          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
16700          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
16701                             DAG.getConstant(ShAmt, MVT::i8));
16702        }
16703
16704        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
16705        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
16706          if (NeedsCondInvert) // Invert the condition if needed.
16707            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
16708                               DAG.getConstant(1, Cond.getValueType()));
16709
16710          // Zero extend the condition if needed.
16711          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
16712                             FalseC->getValueType(0), Cond);
16713          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
16714                             SDValue(FalseC, 0));
16715        }
16716
16717        // Optimize cases that will turn into an LEA instruction.  This requires
16718        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
16719        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
16720          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
16721          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
16722
16723          bool isFastMultiplier = false;
16724          if (Diff < 10) {
16725            switch ((unsigned char)Diff) {
16726              default: break;
16727              case 1:  // result = add base, cond
16728              case 2:  // result = lea base(    , cond*2)
16729              case 3:  // result = lea base(cond, cond*2)
16730              case 4:  // result = lea base(    , cond*4)
16731              case 5:  // result = lea base(cond, cond*4)
16732              case 8:  // result = lea base(    , cond*8)
16733              case 9:  // result = lea base(cond, cond*8)
16734                isFastMultiplier = true;
16735                break;
16736            }
16737          }
16738
16739          if (isFastMultiplier) {
16740            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
16741            if (NeedsCondInvert) // Invert the condition if needed.
16742              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
16743                                 DAG.getConstant(1, Cond.getValueType()));
16744
16745            // Zero extend the condition if needed.
16746            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
16747                               Cond);
16748            // Scale the condition by the difference.
16749            if (Diff != 1)
16750              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
16751                                 DAG.getConstant(Diff, Cond.getValueType()));
16752
16753            // Add the base if non-zero.
16754            if (FalseC->getAPIntValue() != 0)
16755              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
16756                                 SDValue(FalseC, 0));
16757            return Cond;
16758          }
16759        }
16760      }
16761  }
16762
16763  // Canonicalize max and min:
16764  // (x > y) ? x : y -> (x >= y) ? x : y
16765  // (x < y) ? x : y -> (x <= y) ? x : y
16766  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
16767  // the need for an extra compare
16768  // against zero. e.g.
16769  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
16770  // subl   %esi, %edi
16771  // testl  %edi, %edi
16772  // movl   $0, %eax
16773  // cmovgl %edi, %eax
16774  // =>
16775  // xorl   %eax, %eax
16776  // subl   %esi, %edi
16777  // cmovsl %eax, %edi
16778  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
16779      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
16780      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
16781    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16782    switch (CC) {
16783    default: break;
16784    case ISD::SETLT:
16785    case ISD::SETGT: {
16786      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
16787      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
16788                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
16789      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
16790    }
16791    }
16792  }
16793
16794  // Early exit check
16795  if (!TLI.isTypeLegal(VT))
16796    return SDValue();
16797
16798  // Match VSELECTs into subs with unsigned saturation.
16799  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
16800      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
16801      ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
16802       (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
16803    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16804
16805    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
16806    // left side, invert the predicate to simplify the logic below.
16807    SDValue Other;
16808    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
16809      Other = RHS;
16810      CC = ISD::getSetCCInverse(CC, true);
16811    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
16812      Other = LHS;
16813    }
16814
16815    if (Other.getNode() && Other->getNumOperands() == 2 &&
16816        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
16817      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
16818      SDValue CondRHS = Cond->getOperand(1);
16819
16820      // Look for a general sub with unsigned saturation first.
16821      // x >= y ? x-y : 0 --> subus x, y
16822      // x >  y ? x-y : 0 --> subus x, y
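      // (psubus computes the unsigned saturating difference max(x - y, 0),
      // which is exactly x >= y ? x-y : 0.)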
16823      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
16824          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
16825        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
16826
16827      // If the RHS is a constant, we must undo the constant canonicalization.
16828      // x > C-1 ? x+(-C) : 0 --> subus x, C
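      // (x >u C-1 is the same condition as x >=u C, and x+(-C) wraps to x-C,
      // so this is again max(x - C, 0).)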
16829      if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
16830          isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
16831        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
16832        if (CondRHS.getConstantOperandVal(0) == -A-1)
16833          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
16834                             DAG.getConstant(-A, VT));
16835      }
16836
16837      // Another special case: If C was a sign bit, the sub has been
16838      // canonicalized into a xor.
16839      // FIXME: Would it be better to use ComputeMaskedBits to determine whether
16840      //        it's safe to decanonicalize the xor?
16841      // x s< 0 ? x^C : 0 --> subus x, C
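      // (When C is the sign bit, x s< 0 is the same as x >=u C, and then
      // x^C == x-C, so once more this is max(x - C, 0).)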
16842      if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
16843          ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
16844          isSplatVector(OpRHS.getNode())) {
16845        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
16846        if (A.isSignBit())
16847          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
16848      }
16849    }
16850  }
16851
16852  // Try to match a min/max vector operation.
16853  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
16854    std::pair<unsigned, bool> ret =
      matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
16855    unsigned Opc = ret.first;
16856    bool NeedSplit = ret.second;
16857
16858    if (Opc && NeedSplit) {
16859      unsigned NumElems = VT.getVectorNumElements();
16860      // Extract the LHS vectors
16861      SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
16862      SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
16863
16864      // Extract the RHS vectors
16865      SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
16866      SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
16867
16868      // Create min/max for each subvector
16869      LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
16870      RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
16871
16872      // Merge the result
16873      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
16874    } else if (Opc)
16875      return DAG.getNode(Opc, DL, VT, LHS, RHS);
16876  }
16877
16878  // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
16879  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
16880      // Check if SETCC has already been promoted
16881      TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) {
16882
16883    assert(Cond.getValueType().isVector() &&
16884           "vector select expects a vector selector!");
16885
16886    EVT IntVT = Cond.getValueType();
16887    bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
16888    bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
16889
16890    if (!TValIsAllOnes && !FValIsAllZeros) {
16891      // Try inverting the condition if the true value is not all 1s and the
16892      // false value is not all 0s.
16893      bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
16894      bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
16895
16896      if (TValIsAllZeros || FValIsAllOnes) {
16897        SDValue CC = Cond.getOperand(2);
16898        ISD::CondCode NewCC =
16899          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
16900                               Cond.getOperand(0).getValueType().isInteger());
16901        Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1),
                            NewCC);
16902        std::swap(LHS, RHS);
16903        TValIsAllOnes = FValIsAllOnes;
16904        FValIsAllZeros = TValIsAllZeros;
16905      }
16906    }
16907
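    // Cond is already a full-width lane mask here, so:
    //   vselect Cond, -1, X --> or  Cond, X
    //   vselect Cond, X,  0 --> and Cond, X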
16908    if (TValIsAllOnes || FValIsAllZeros) {
16909      SDValue Ret;
16910
16911      if (TValIsAllOnes && FValIsAllZeros)
16912        Ret = Cond;
16913      else if (TValIsAllOnes)
16914        Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
16915                          DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
16916      else if (FValIsAllZeros)
16917        Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
16918                          DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
16919
16920      return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
16921    }
16922  }
16923
16924  // If we know that this node is legal then we know that it is going to be
16925  // matched by one of the SSE/AVX BLEND instructions. These instructions only
16926  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
16927  // to simplify previous instructions.
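  // e.g. for a v4i32 selector only bit 31 of each element matters, so that is
  // the only bit marked as demanded below.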
16928  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
16929      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
16930    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
16931
16932    // Don't optimize vector selects that map to mask-registers.
16933    if (BitWidth == 1)
16934      return SDValue();
16935
16936    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
16937    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
16938
16939    APInt KnownZero, KnownOne;
16940    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
16941                                          DCI.isBeforeLegalizeOps());
16942    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
16943        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
16944      DCI.CommitTargetLoweringOpt(TLO);
16945  }
16946
16947  return SDValue();
16948}
16949
16950// Check whether a boolean test is testing a boolean value generated by
16951// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
16952// code.
16953//
16954// Simplify the following patterns:
16955// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
16956// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
16957// to (Op EFLAGS Cond)
16958//
16959// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
16960// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
16961// to (Op EFLAGS !Cond)
16962//
16963// where Op could be BRCOND or CMOV.
16964//
16965static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
16966  // Quit unless this is a CMP, or a SUB whose value result is unused.
16967  if (Cmp.getOpcode() != X86ISD::CMP &&
16968      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
16969      return SDValue();
16970
16971  // Quit if not used as a boolean value.
16972  if (CC != X86::COND_E && CC != X86::COND_NE)
16973    return SDValue();
16974
16975  // Check CMP operands. One of them should be 0 or 1 and the other should be
16976  // a SETCC or a value extended from it.
16977  SDValue Op1 = Cmp.getOperand(0);
16978  SDValue Op2 = Cmp.getOperand(1);
16979
16980  SDValue SetCC;
16981  const ConstantSDNode* C = 0;
16982  bool needOppositeCond = (CC == X86::COND_E);
16983  bool checkAgainstTrue = false; // Is it a comparison against 1?
16984
16985  if ((C = dyn_cast<ConstantSDNode>(Op1)))
16986    SetCC = Op2;
16987  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
16988    SetCC = Op1;
16989  else // Quit if neither operand is a constant.
16990    return SDValue();
16991
16992  if (C->getZExtValue() == 1) {
16993    needOppositeCond = !needOppositeCond;
16994    checkAgainstTrue = true;
16995  } else if (C->getZExtValue() != 0)
16996    // Quit if the constant is neither 0 nor 1.
16997    return SDValue();
16998
16999  bool truncatedToBoolWithAnd = false;
17000  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
17001  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
17002         SetCC.getOpcode() == ISD::TRUNCATE ||
17003         SetCC.getOpcode() == ISD::AND) {
17004    if (SetCC.getOpcode() == ISD::AND) {
17005      int OpIdx = -1;
17006      ConstantSDNode *CS;
17007      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
17008          CS->getZExtValue() == 1)
17009        OpIdx = 1;
17010      if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
17011          CS->getZExtValue() == 1)
17012        OpIdx = 0;
17013      if (OpIdx == -1)
17014        break;
17015      SetCC = SetCC.getOperand(OpIdx);
17016      truncatedToBoolWithAnd = true;
17017    } else
17018      SetCC = SetCC.getOperand(0);
17019  }
17020
17021  switch (SetCC.getOpcode()) {
17022  case X86ISD::SETCC_CARRY:
17023    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
17024    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
17025    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
17026    // truncated to i1 using 'and'.
17027    if (checkAgainstTrue && !truncatedToBoolWithAnd)
17028      break;
17029    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
17030           "Invalid use of SETCC_CARRY!");
17031    // FALL THROUGH
17032  case X86ISD::SETCC:
17033    // Set the condition code or opposite one if necessary.
17034    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
17035    if (needOppositeCond)
17036      CC = X86::GetOppositeBranchCondition(CC);
17037    return SetCC.getOperand(1);
17038  case X86ISD::CMOV: {
17039    // Check whether false/true value has canonical one, i.e. 0 or 1.
17040    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
17041    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
17042    // Quit if true value is not a constant.
17043    if (!TVal)
17044      return SDValue();
17045    // Quit if false value is not a constant.
17046    if (!FVal) {
17047      SDValue Op = SetCC.getOperand(0);
17048      // Skip 'zext' or 'trunc' node.
17049      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
17050          Op.getOpcode() == ISD::TRUNCATE)
17051        Op = Op.getOperand(0);
17052      // A special case for rdrand/rdseed, which produce a value of 0 when the
17053      // condition is false.
17054      if ((Op.getOpcode() != X86ISD::RDRAND &&
17055           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
17056        return SDValue();
17057    }
17058    // Quit if false value is not the constant 0 or 1.
17059    bool FValIsFalse = true;
17060    if (FVal && FVal->getZExtValue() != 0) {
17061      if (FVal->getZExtValue() != 1)
17062        return SDValue();
17063      // If FVal is 1, opposite cond is needed.
17064      needOppositeCond = !needOppositeCond;
17065      FValIsFalse = false;
17066    }
17067    // Quit if TVal is not the constant opposite of FVal.
17068    if (FValIsFalse && TVal->getZExtValue() != 1)
17069      return SDValue();
17070    if (!FValIsFalse && TVal->getZExtValue() != 0)
17071      return SDValue();
17072    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
17073    if (needOppositeCond)
17074      CC = X86::GetOppositeBranchCondition(CC);
17075    return SetCC.getOperand(3);
17076  }
17077  }
17078
17079  return SDValue();
17080}
17081
17082/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
17083static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
17084                                  TargetLowering::DAGCombinerInfo &DCI,
17085                                  const X86Subtarget *Subtarget) {
17086  SDLoc DL(N);
17087
17088  // If the flag operand isn't dead, don't touch this CMOV.
17089  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
17090    return SDValue();
17091
17092  SDValue FalseOp = N->getOperand(0);
17093  SDValue TrueOp = N->getOperand(1);
17094  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
17095  SDValue Cond = N->getOperand(3);
17096
17097  if (CC == X86::COND_E || CC == X86::COND_NE) {
17098    switch (Cond.getOpcode()) {
17099    default: break;
17100    case X86ISD::BSR:
17101    case X86ISD::BSF:
17102      // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
17103      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
17104        return (CC == X86::COND_E) ? FalseOp : TrueOp;
17105    }
17106  }
17107
17108  SDValue Flags;
17109
17110  Flags = checkBoolTestSetCCCombine(Cond, CC);
17111  if (Flags.getNode() &&
17112      // Extra check as FCMOV only supports a subset of X86 cond.
17113      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
17114    SDValue Ops[] = { FalseOp, TrueOp,
17115                      DAG.getConstant(CC, MVT::i8), Flags };
17116    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
17117                       Ops, array_lengthof(Ops));
17118  }
17119
17120  // If this is a select between two integer constants, try to do some
17121  // optimizations.  Note that the operands are ordered the opposite of SELECT
17122  // operands.
17123  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
17124    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
17125      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
17126      // larger than FalseC (the false value).
17127      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
17128        CC = X86::GetOppositeBranchCondition(CC);
17129        std::swap(TrueC, FalseC);
17130        std::swap(TrueOp, FalseOp);
17131      }
17132
17133      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
17134      // This is efficient for any integer data type (including i8/i16) and
17135      // shift amount.
17136      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
17137        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
17138                           DAG.getConstant(CC, MVT::i8), Cond);
17139
17140        // Zero extend the condition if needed.
17141        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
17142
17143        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
17144        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
17145                           DAG.getConstant(ShAmt, MVT::i8));
17146        if (N->getNumValues() == 2)  // Dead flag value?
17147          return DCI.CombineTo(N, Cond, SDValue());
17148        return Cond;
17149      }
17150
17151      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
17152      // for any integer data type, including i8/i16.
17153      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
17154        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
17155                           DAG.getConstant(CC, MVT::i8), Cond);
17156
17157        // Zero extend the condition if needed.
17158        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
17159                           FalseC->getValueType(0), Cond);
17160        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
17161                           SDValue(FalseC, 0));
17162
17163        if (N->getNumValues() == 2)  // Dead flag value?
17164          return DCI.CombineTo(N, Cond, SDValue());
17165        return Cond;
17166      }
17167
17168      // Optimize cases that will turn into an LEA instruction.  This requires
17169      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
17170      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
17171        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
17172        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
17173
17174        bool isFastMultiplier = false;
17175        if (Diff < 10) {
17176          switch ((unsigned char)Diff) {
17177          default: break;
17178          case 1:  // result = add base, cond
17179          case 2:  // result = lea base(    , cond*2)
17180          case 3:  // result = lea base(cond, cond*2)
17181          case 4:  // result = lea base(    , cond*4)
17182          case 5:  // result = lea base(cond, cond*4)
17183          case 8:  // result = lea base(    , cond*8)
17184          case 9:  // result = lea base(cond, cond*8)
17185            isFastMultiplier = true;
17186            break;
17187          }
17188        }
17189
17190        if (isFastMultiplier) {
17191          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
17192          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
17193                             DAG.getConstant(CC, MVT::i8), Cond);
17194          // Zero extend the condition if needed.
17195          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
17196                             Cond);
17197          // Scale the condition by the difference.
17198          if (Diff != 1)
17199            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
17200                               DAG.getConstant(Diff, Cond.getValueType()));
17201
17202          // Add the base if non-zero.
17203          if (FalseC->getAPIntValue() != 0)
17204            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
17205                               SDValue(FalseC, 0));
17206          if (N->getNumValues() == 2)  // Dead flag value?
17207            return DCI.CombineTo(N, Cond, SDValue());
17208          return Cond;
17209        }
17210      }
17211    }
17212  }
17213
17214  // Handle these cases:
17215  //   (select (x != c), e, c) -> (select (x != c), e, x),
17216  //   (select (x == c), c, e) -> (select (x == c), x, e)
17217  // where c is an integer constant, and the "select" is the combination
17218  // of CMOV and CMP.
17219  //
17220  // The rationale for this change is that the conditional-move from a constant
17221  // needs two instructions, however, conditional-move from a register needs
17222  // only one instruction.
17223  //
17224  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
17225  //  some instruction-combining opportunities. This opt needs to be
17226  //  postponed as late as possible.
17227  //
17228  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
17229    // The DCI.xxxx conditions are provided to postpone the optimization as
17230    // late as possible.
17231
17232    ConstantSDNode *CmpAgainst = 0;
17233    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
17234        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
17235        !isa<ConstantSDNode>(Cond.getOperand(0))) {
17236
17237      if (CC == X86::COND_NE &&
17238          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
17239        CC = X86::GetOppositeBranchCondition(CC);
17240        std::swap(TrueOp, FalseOp);
17241      }
17242
17243      if (CC == X86::COND_E &&
17244          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
17245        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
17246                          DAG.getConstant(CC, MVT::i8), Cond };
17247        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops,
17248                           array_lengthof(Ops));
17249      }
17250    }
17251  }
17252
17253  return SDValue();
17254}
17255
17256/// PerformMulCombine - Optimize a single multiply with constant into two
17257/// in order to implement it with two cheaper instructions, e.g.
17258/// LEA + SHL, LEA + LEA.
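/// For example, (mul x, 45) becomes (mul (mul x, 9), 5), i.e. two LEAs, and
/// (mul x, 48) typically becomes (mul (shl x, 4), 3), i.e. a shift plus an LEA.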
17259static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
17260                                 TargetLowering::DAGCombinerInfo &DCI) {
17261  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17262    return SDValue();
17263
17264  EVT VT = N->getValueType(0);
17265  if (VT != MVT::i64)
17266    return SDValue();
17267
17268  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
17269  if (!C)
17270    return SDValue();
17271  uint64_t MulAmt = C->getZExtValue();
17272  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
17273    return SDValue();
17274
17275  uint64_t MulAmt1 = 0;
17276  uint64_t MulAmt2 = 0;
17277  if ((MulAmt % 9) == 0) {
17278    MulAmt1 = 9;
17279    MulAmt2 = MulAmt / 9;
17280  } else if ((MulAmt % 5) == 0) {
17281    MulAmt1 = 5;
17282    MulAmt2 = MulAmt / 5;
17283  } else if ((MulAmt % 3) == 0) {
17284    MulAmt1 = 3;
17285    MulAmt2 = MulAmt / 3;
17286  }
17287  if (MulAmt2 &&
17288      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
17289    SDLoc DL(N);
17290
17291    if (isPowerOf2_64(MulAmt2) &&
17292        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
17293      // If the second multiplier is pow2, issue it first. We want the multiply by
17294      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
17295      // is an add.
17296      std::swap(MulAmt1, MulAmt2);
17297
17298    SDValue NewMul;
17299    if (isPowerOf2_64(MulAmt1))
17300      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
17301                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
17302    else
17303      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
17304                           DAG.getConstant(MulAmt1, VT));
17305
17306    if (isPowerOf2_64(MulAmt2))
17307      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
17308                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
17309    else
17310      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
17311                           DAG.getConstant(MulAmt2, VT));
17312
17313    // Do not add new nodes to DAG combiner worklist.
17314    DCI.CombineTo(N, NewMul, false);
17315  }
17316  return SDValue();
17317}
17318
17319static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
17320  SDValue N0 = N->getOperand(0);
17321  SDValue N1 = N->getOperand(1);
17322  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
17323  EVT VT = N0.getValueType();
17324
17325  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
17326  // since the result of setcc_c is all zeros or all ones.
17327  if (VT.isInteger() && !VT.isVector() &&
17328      N1C && N0.getOpcode() == ISD::AND &&
17329      N0.getOperand(1).getOpcode() == ISD::Constant) {
17330    SDValue N00 = N0.getOperand(0);
17331    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
17332        ((N00.getOpcode() == ISD::ANY_EXTEND ||
17333          N00.getOpcode() == ISD::ZERO_EXTEND) &&
17334         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
17335      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
17336      APInt ShAmt = N1C->getAPIntValue();
17337      Mask = Mask.shl(ShAmt);
17338      if (Mask != 0)
17339        return DAG.getNode(ISD::AND, SDLoc(N), VT,
17340                           N00, DAG.getConstant(Mask, VT));
17341    }
17342  }
17343
17344  // Hardware support for vector shifts is sparse, which makes us scalarize the
17345  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
17346  // SHL.
17347  // (shl V, 1) -> add V,V
17348  if (isSplatVector(N1.getNode())) {
17349    assert(N0.getValueType().isVector() && "Invalid vector shift type");
17350    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
17351    // We shift all of the values by one. In many cases we do not have
17352    // hardware support for this operation. This is better expressed as an ADD
17353    // of two values.
17354    if (N1C && (1 == N1C->getZExtValue())) {
17355      return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
17356    }
17357  }
17358
17359  return SDValue();
17360}
17361
17362/// \brief Returns a vector of 0s if the input node is a vector logical
17363/// shift by a constant amount which is known to be bigger than or equal
17364/// to the vector element size in bits.
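/// For example, a v4i32 logical shift by a splat constant of 32 or more can
/// never produce a non-zero element, so it is replaced with the zero vector.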
17365static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
17366                                      const X86Subtarget *Subtarget) {
17367  EVT VT = N->getValueType(0);
17368
17369  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
17370      (!Subtarget->hasInt256() ||
17371       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
17372    return SDValue();
17373
17374  SDValue Amt = N->getOperand(1);
17375  SDLoc DL(N);
17376  if (isSplatVector(Amt.getNode())) {
17377    SDValue SclrAmt = Amt->getOperand(0);
17378    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
17379      APInt ShiftAmt = C->getAPIntValue();
17380      unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
17381
17382      // SSE2/AVX2 logical shifts always return a vector of 0s
17383      // if the shift amount is bigger than or equal to
17384      // the element size. The constant shift amount will be
17385      // encoded as an 8-bit immediate.
17386      if (ShiftAmt.trunc(8).uge(MaxAmount))
17387        return getZeroVector(VT, Subtarget, DAG, DL);
17388    }
17389  }
17390
17391  return SDValue();
17392}
17393
17394/// PerformShiftCombine - Combine shifts.
17395static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
17396                                   TargetLowering::DAGCombinerInfo &DCI,
17397                                   const X86Subtarget *Subtarget) {
17398  if (N->getOpcode() == ISD::SHL) {
17399    SDValue V = PerformSHLCombine(N, DAG);
17400    if (V.getNode()) return V;
17401  }
17402
17403  if (N->getOpcode() != ISD::SRA) {
17404    // Try to fold this logical shift into a zero vector.
17405    SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
17406    if (V.getNode()) return V;
17407  }
17408
17409  return SDValue();
17410}
17411
17412// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
17413// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
17414// and friends.  Likewise for OR -> CMPNEQSS.
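// For example, with f32 operands,
//   (and (setcc e (cmp a, b)), (setcc np (cmp a, b)))
// becomes (trunc (and (bitcast (cmpeqss a, b)), 1)), i.e. a single FP compare.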
17415static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
17416                            TargetLowering::DAGCombinerInfo &DCI,
17417                            const X86Subtarget *Subtarget) {
17418  unsigned opcode;
17419
17420  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
17421  // we're requiring SSE2 for both.
17422  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
17423    SDValue N0 = N->getOperand(0);
17424    SDValue N1 = N->getOperand(1);
17425    SDValue CMP0 = N0->getOperand(1);
17426    SDValue CMP1 = N1->getOperand(1);
17427    SDLoc DL(N);
17428
17429    // The SETCCs should both refer to the same CMP.
17430    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
17431      return SDValue();
17432
17433    SDValue CMP00 = CMP0->getOperand(0);
17434    SDValue CMP01 = CMP0->getOperand(1);
17435    EVT     VT    = CMP00.getValueType();
17436
17437    if (VT == MVT::f32 || VT == MVT::f64) {
17438      bool ExpectingFlags = false;
17439      // Check for any users that want flags:
17440      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
17441           !ExpectingFlags && UI != UE; ++UI)
17442        switch (UI->getOpcode()) {
17443        default:
17444        case ISD::BR_CC:
17445        case ISD::BRCOND:
17446        case ISD::SELECT:
17447          ExpectingFlags = true;
17448          break;
17449        case ISD::CopyToReg:
17450        case ISD::SIGN_EXTEND:
17451        case ISD::ZERO_EXTEND:
17452        case ISD::ANY_EXTEND:
17453          break;
17454        }
17455
17456      if (!ExpectingFlags) {
17457        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
17458        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
17459
17460        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
17461          X86::CondCode tmp = cc0;
17462          cc0 = cc1;
17463          cc1 = tmp;
17464        }
17465
17466        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
17467            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
17468          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
17469          X86ISD::NodeType NTOperator = is64BitFP ?
17470            X86ISD::FSETCCsd : X86ISD::FSETCCss;
17471          // FIXME: need symbolic constants for these magic numbers.
17472          // See X86ATTInstPrinter.cpp:printSSECC().
17473          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
17474          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
17475                                              DAG.getConstant(x86cc, MVT::i8));
17476          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
17477                                              OnesOrZeroesF);
17478          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
17479                                      DAG.getConstant(1, MVT::i32));
17480          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
17481          return OneBitOfTruth;
17482        }
17483      }
17484    }
17485  }
17486  return SDValue();
17487}
17488
17489/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
17490/// so it can be folded inside ANDNP.
17491static bool CanFoldXORWithAllOnes(const SDNode *N) {
17492  EVT VT = N->getValueType(0);
17493
17494  // Match direct AllOnes for 128 and 256-bit vectors
17495  if (ISD::isBuildVectorAllOnes(N))
17496    return true;
17497
17498  // Look through a bit convert.
17499  if (N->getOpcode() == ISD::BITCAST)
17500    N = N->getOperand(0).getNode();
17501
17502  // Sometimes the operand may come from an insert_subvector building a 256-bit
17503  // allones vector.
17504  if (VT.is256BitVector() &&
17505      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
17506    SDValue V1 = N->getOperand(0);
17507    SDValue V2 = N->getOperand(1);
17508
17509    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
17510        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
17511        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
17512        ISD::isBuildVectorAllOnes(V2.getNode()))
17513      return true;
17514  }
17515
17516  return false;
17517}
17518
17519// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
17520// register. In most cases we actually compare or select YMM-sized registers
17521// and mixing the two types creates horrible code. This method optimizes
17522// some of the transition sequences.
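// For example, (v8i32 (sign_extend (and (trunc A), (trunc B)))), with A and B
// of type v8i32, is rewritten as (sign_extend_inreg (and A, B)) so that the
// logic operation is performed directly on the wide registers.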
17523static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
17524                                 TargetLowering::DAGCombinerInfo &DCI,
17525                                 const X86Subtarget *Subtarget) {
17526  EVT VT = N->getValueType(0);
17527  if (!VT.is256BitVector())
17528    return SDValue();
17529
17530  assert((N->getOpcode() == ISD::ANY_EXTEND ||
17531          N->getOpcode() == ISD::ZERO_EXTEND ||
17532          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
17533
17534  SDValue Narrow = N->getOperand(0);
17535  EVT NarrowVT = Narrow->getValueType(0);
17536  if (!NarrowVT.is128BitVector())
17537    return SDValue();
17538
17539  if (Narrow->getOpcode() != ISD::XOR &&
17540      Narrow->getOpcode() != ISD::AND &&
17541      Narrow->getOpcode() != ISD::OR)
17542    return SDValue();
17543
17544  SDValue N0  = Narrow->getOperand(0);
17545  SDValue N1  = Narrow->getOperand(1);
17546  SDLoc DL(Narrow);
17547
17548  // The left side has to be a trunc.
17549  if (N0.getOpcode() != ISD::TRUNCATE)
17550    return SDValue();
17551
17552  // The type of the truncated inputs.
17553  EVT WideVT = N0->getOperand(0)->getValueType(0);
17554  if (WideVT != VT)
17555    return SDValue();
17556
17557  // The right side has to be a 'trunc' or a constant vector.
17558  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
17559  bool RHSConst = (isSplatVector(N1.getNode()) &&
17560                   isa<ConstantSDNode>(N1->getOperand(0)));
17561  if (!RHSTrunc && !RHSConst)
17562    return SDValue();
17563
17564  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17565
17566  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
17567    return SDValue();
17568
17569  // Set N0 and N1 to hold the inputs to the new wide operation.
17570  N0 = N0->getOperand(0);
17571  if (RHSConst) {
17572    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
17573                     N1->getOperand(0));
17574    SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
17575    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
17576  } else if (RHSTrunc) {
17577    N1 = N1->getOperand(0);
17578  }
17579
17580  // Generate the wide operation.
17581  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
17582  unsigned Opcode = N->getOpcode();
17583  switch (Opcode) {
17584  case ISD::ANY_EXTEND:
17585    return Op;
17586  case ISD::ZERO_EXTEND: {
17587    unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
17588    APInt Mask = APInt::getAllOnesValue(InBits);
17589    Mask = Mask.zext(VT.getScalarType().getSizeInBits());
17590    return DAG.getNode(ISD::AND, DL, VT,
17591                       Op, DAG.getConstant(Mask, VT));
17592  }
17593  case ISD::SIGN_EXTEND:
17594    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
17595                       Op, DAG.getValueType(NarrowVT));
17596  default:
17597    llvm_unreachable("Unexpected opcode");
17598  }
17599}
17600
17601static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
17602                                 TargetLowering::DAGCombinerInfo &DCI,
17603                                 const X86Subtarget *Subtarget) {
17604  EVT VT = N->getValueType(0);
17605  if (DCI.isBeforeLegalizeOps())
17606    return SDValue();
17607
17608  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
17609  if (R.getNode())
17610    return R;
17611
17612  // Create BLSI, BLSR, BZHI, and BEXTR instructions:
17613  // BLSI is X & (-X)
17614  // BLSR is X & (X-1)
17615  // BZHI is X & ((1 << Y) - 1)
17616  // BEXTR is ((X >> imm) & (2**size-1))
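  // For example, (and (srl X, 4), 255) becomes (BEXTR X, 0x804), extracting
  // the eight bits of X that start at bit 4.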
17617  if (VT == MVT::i32 || VT == MVT::i64) {
17618    SDValue N0 = N->getOperand(0);
17619    SDValue N1 = N->getOperand(1);
17620    SDLoc DL(N);
17621
17622    if (Subtarget->hasBMI()) {
17623      // Check LHS for neg
17624      if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
17625          isZero(N0.getOperand(0)))
17626        return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
17627
17628      // Check RHS for neg
17629      if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
17630          isZero(N1.getOperand(0)))
17631        return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
17632
17633      // Check LHS for X-1
17634      if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
17635          isAllOnes(N0.getOperand(1)))
17636        return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
17637
17638      // Check RHS for X-1
17639      if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
17640          isAllOnes(N1.getOperand(1)))
17641        return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
17642    }
17643
17644    if (Subtarget->hasBMI2()) {
17645      // Check for (and (add (shl 1, Y), -1), X)
17646      if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
17647        SDValue N00 = N0.getOperand(0);
17648        if (N00.getOpcode() == ISD::SHL) {
17649          SDValue N001 = N00.getOperand(1);
17650          assert(N001.getValueType() == MVT::i8 && "unexpected type");
17651          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0));
17652          if (C && C->getZExtValue() == 1)
17653            return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001);
17654        }
17655      }
17656
17657      // Check for (and X, (add (shl 1, Y), -1))
17658      if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) {
17659        SDValue N10 = N1.getOperand(0);
17660        if (N10.getOpcode() == ISD::SHL) {
17661          SDValue N101 = N10.getOperand(1);
17662          assert(N101.getValueType() == MVT::i8 && "unexpected type");
17663          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0));
17664          if (C && C->getZExtValue() == 1)
17665            return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101);
17666        }
17667      }
17668    }
17669
17670    // Check for BEXTR.
17671    if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
17672        (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
17673      ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
17674      ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
17675      if (MaskNode && ShiftNode) {
17676        uint64_t Mask = MaskNode->getZExtValue();
17677        uint64_t Shift = ShiftNode->getZExtValue();
17678        if (isMask_64(Mask)) {
17679          uint64_t MaskSize = CountPopulation_64(Mask);
17680          if (Shift + MaskSize <= VT.getSizeInBits())
17681            return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
17682                               DAG.getConstant(Shift | (MaskSize << 8), VT));
17683        }
17684      }
17685    } // BEXTR
17686
17687    return SDValue();
17688  }
17689
17690  // Want to form ANDNP nodes:
17691  // 1) In the hopes of then easily combining them with OR and AND nodes
17692  //    to form PBLEND/PSIGN.
17693  // 2) To match ANDN packed intrinsics
17694  if (VT != MVT::v2i64 && VT != MVT::v4i64)
17695    return SDValue();
17696
17697  SDValue N0 = N->getOperand(0);
17698  SDValue N1 = N->getOperand(1);
17699  SDLoc DL(N);
17700
17701  // Check LHS for vnot
17702  if (N0.getOpcode() == ISD::XOR &&
17703      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
17704      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
17705    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
17706
17707  // Check RHS for vnot
17708  if (N1.getOpcode() == ISD::XOR &&
17709      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
17710      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
17711    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
17712
17713  return SDValue();
17714}
17715
17716static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
17717                                TargetLowering::DAGCombinerInfo &DCI,
17718                                const X86Subtarget *Subtarget) {
17719  EVT VT = N->getValueType(0);
17720  if (DCI.isBeforeLegalizeOps())
17721    return SDValue();
17722
17723  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
17724  if (R.getNode())
17725    return R;
17726
17727  SDValue N0 = N->getOperand(0);
17728  SDValue N1 = N->getOperand(1);
17729
17730  // look for psign/blend
17731  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
17732    if (!Subtarget->hasSSSE3() ||
17733        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
17734      return SDValue();
17735
17736    // Canonicalize pandn to RHS
17737    if (N0.getOpcode() == X86ISD::ANDNP)
17738      std::swap(N0, N1);
17739    // or (and (m, y), (pandn m, x))
17740    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
17741      SDValue Mask = N1.getOperand(0);
17742      SDValue X    = N1.getOperand(1);
17743      SDValue Y;
17744      if (N0.getOperand(0) == Mask)
17745        Y = N0.getOperand(1);
17746      if (N0.getOperand(1) == Mask)
17747        Y = N0.getOperand(0);
17748
17749      // Check to see if the mask appeared in both the AND and the ANDNP.
17750      if (!Y.getNode())
17751        return SDValue();
17752
17753      // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
17754      // Look through mask bitcast.
17755      if (Mask.getOpcode() == ISD::BITCAST)
17756        Mask = Mask.getOperand(0);
17757      if (X.getOpcode() == ISD::BITCAST)
17758        X = X.getOperand(0);
17759      if (Y.getOpcode() == ISD::BITCAST)
17760        Y = Y.getOperand(0);
17761
17762      EVT MaskVT = Mask.getValueType();
17763
17764      // Validate that the Mask operand is a vector sra node.
17765      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
17766      // there is no psrai.b
17767      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
17768      unsigned SraAmt = ~0;
17769      if (Mask.getOpcode() == ISD::SRA) {
17770        SDValue Amt = Mask.getOperand(1);
17771        if (isSplatVector(Amt.getNode())) {
17772          SDValue SclrAmt = Amt->getOperand(0);
17773          if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
17774            SraAmt = C->getZExtValue();
17775        }
17776      } else if (Mask.getOpcode() == X86ISD::VSRAI) {
17777        SDValue SraC = Mask.getOperand(1);
17778        SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
17779      }
17780      if ((SraAmt + 1) != EltBits)
17781        return SDValue();
17782
17783      SDLoc DL(N);
17784
17785      // Now we know we at least have a pblendvb with the mask val.  See if
17786      // we can form a psignb/w/d.
17787      // psign = x.type == y.type == mask.type && y = sub(0, x);
17788      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
17789          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
17790          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
17791        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
17792               "Unsupported VT for PSIGN");
17793        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
17794        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
17795      }
17796      // PBLENDVB is only available on SSE 4.1.
17797      if (!Subtarget->hasSSE41())
17798        return SDValue();
17799
17800      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
17801
17802      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
17803      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
17804      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
17805      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
17806      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
17807    }
17808  }
17809
17810  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
17811    return SDValue();
17812
17813  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
17814  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
17815    std::swap(N0, N1);
17816  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
17817    return SDValue();
17818  if (!N0.hasOneUse() || !N1.hasOneUse())
17819    return SDValue();
17820
17821  SDValue ShAmt0 = N0.getOperand(1);
17822  if (ShAmt0.getValueType() != MVT::i8)
17823    return SDValue();
17824  SDValue ShAmt1 = N1.getOperand(1);
17825  if (ShAmt1.getValueType() != MVT::i8)
17826    return SDValue();
17827  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
17828    ShAmt0 = ShAmt0.getOperand(0);
17829  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
17830    ShAmt1 = ShAmt1.getOperand(0);
17831
17832  SDLoc DL(N);
17833  unsigned Opc = X86ISD::SHLD;
17834  SDValue Op0 = N0.getOperand(0);
17835  SDValue Op1 = N1.getOperand(0);
17836  if (ShAmt0.getOpcode() == ISD::SUB) {
17837    Opc = X86ISD::SHRD;
17838    std::swap(Op0, Op1);
17839    std::swap(ShAmt0, ShAmt1);
17840  }
17841
17842  unsigned Bits = VT.getSizeInBits();
17843  if (ShAmt1.getOpcode() == ISD::SUB) {
17844    SDValue Sum = ShAmt1.getOperand(0);
17845    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
17846      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
17847      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
17848        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
17849      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
17850        return DAG.getNode(Opc, DL, VT,
17851                           Op0, Op1,
17852                           DAG.getNode(ISD::TRUNCATE, DL,
17853                                       MVT::i8, ShAmt0));
17854    }
17855  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
17856    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
17857    if (ShAmt0C &&
17858        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
17859      return DAG.getNode(Opc, DL, VT,
17860                         N0.getOperand(0), N1.getOperand(0),
17861                         DAG.getNode(ISD::TRUNCATE, DL,
17862                                       MVT::i8, ShAmt0));
17863  }
17864
17865  return SDValue();
17866}
17867
17868// Generate NEG and CMOV for integer abs.
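// For example, for i32 the pattern (xor (add X, (sra X, 31)), (sra X, 31)) is
// rewritten as Neg = (sub 0, X) followed by a CMOV that selects X or Neg based
// on the flags of the subtraction (COND_GE).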
17869static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
17870  EVT VT = N->getValueType(0);
17871
17872  // Since X86 does not have CMOV for 8-bit integer, we don't convert
17873  // 8-bit integer abs to NEG and CMOV.
17874  if (VT.isInteger() && VT.getSizeInBits() == 8)
17875    return SDValue();
17876
17877  SDValue N0 = N->getOperand(0);
17878  SDValue N1 = N->getOperand(1);
17879  SDLoc DL(N);
17880
17881  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
17882  // and change it to SUB and CMOV.
17883  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
17884      N0.getOpcode() == ISD::ADD &&
17885      N0.getOperand(1) == N1 &&
17886      N1.getOpcode() == ISD::SRA &&
17887      N1.getOperand(0) == N0.getOperand(0))
17888    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
17889      if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
17890        // Generate SUB & CMOV.
17891        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
17892                                  DAG.getConstant(0, VT), N0.getOperand(0));
17893
17894        SDValue Ops[] = { N0.getOperand(0), Neg,
17895                          DAG.getConstant(X86::COND_GE, MVT::i8),
17896                          SDValue(Neg.getNode(), 1) };
17897        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
17898                           Ops, array_lengthof(Ops));
17899      }
17900  return SDValue();
17901}
17902
17903// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
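// BLSMSK computes X ^ (X - 1), which sets every bit up to and including the
// lowest set bit of X; e.g. X = 0b101000 yields 0b001111.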
17904static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
17905                                 TargetLowering::DAGCombinerInfo &DCI,
17906                                 const X86Subtarget *Subtarget) {
17907  EVT VT = N->getValueType(0);
17908  if (DCI.isBeforeLegalizeOps())
17909    return SDValue();
17910
17911  if (Subtarget->hasCMov()) {
17912    SDValue RV = performIntegerAbsCombine(N, DAG);
17913    if (RV.getNode())
17914      return RV;
17915  }
17916
17917  // Try forming BMI if it is available.
17918  if (!Subtarget->hasBMI())
17919    return SDValue();
17920
17921  if (VT != MVT::i32 && VT != MVT::i64)
17922    return SDValue();
17923
17924  assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
17925
17926  // Create BLSMSK instructions by finding X ^ (X-1)
17927  SDValue N0 = N->getOperand(0);
17928  SDValue N1 = N->getOperand(1);
17929  SDLoc DL(N);
17930
17931  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
17932      isAllOnes(N0.getOperand(1)))
17933    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
17934
17935  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
17936      isAllOnes(N1.getOperand(1)))
17937    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
17938
17939  return SDValue();
17940}
17941
17942/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
17943static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
17944                                  TargetLowering::DAGCombinerInfo &DCI,
17945                                  const X86Subtarget *Subtarget) {
17946  LoadSDNode *Ld = cast<LoadSDNode>(N);
17947  EVT RegVT = Ld->getValueType(0);
17948  EVT MemVT = Ld->getMemoryVT();
17949  SDLoc dl(Ld);
17950  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17951  unsigned RegSz = RegVT.getSizeInBits();
17952
17953  // On Sandy Bridge, unaligned 256-bit loads are inefficient.
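  // Split such a load into two 128-bit halves loaded from Ptr and Ptr+16 and
  // reassembled with insert_subvector; the two chains are joined by a
  // TokenFactor.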
17954  ISD::LoadExtType Ext = Ld->getExtensionType();
17955  unsigned Alignment = Ld->getAlignment();
17956  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
17957  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
17958      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
17959    unsigned NumElems = RegVT.getVectorNumElements();
17960    if (NumElems < 2)
17961      return SDValue();
17962
17963    SDValue Ptr = Ld->getBasePtr();
17964    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
17965
17966    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
17967                                  NumElems/2);
17968    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
17969                                Ld->getPointerInfo(), Ld->isVolatile(),
17970                                Ld->isNonTemporal(), Ld->isInvariant(),
17971                                Alignment);
17972    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
17973    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
17974                                Ld->getPointerInfo(), Ld->isVolatile(),
17975                                Ld->isNonTemporal(), Ld->isInvariant(),
17976                                std::min(16U, Alignment));
17977    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
17978                             Load1.getValue(1),
17979                             Load2.getValue(1));
17980
17981    SDValue NewVec = DAG.getUNDEF(RegVT);
17982    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
17983    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
17984    return DCI.CombineTo(N, NewVec, TF, true);
17985  }
17986
17987  // If this is a vector EXT Load then attempt to optimize it using a
17988  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
17989  // expansion is still better than scalar code.
17990  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
17991  // emit a shuffle and an arithmetic shift.
17992  // TODO: It is possible to support ZExt by zeroing the undef values
17993  // during the shuffle phase or after the shuffle.
17994  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
17995      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
17996    assert(MemVT != RegVT && "Cannot extend to the same type");
17997    assert(MemVT.isVector() && "Must load a vector from memory");
17998
17999    unsigned NumElems = RegVT.getVectorNumElements();
18000    unsigned MemSz = MemVT.getSizeInBits();
18001    assert(RegSz > MemSz && "Register size must be greater than the mem size");
18002
18003    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
18004      return SDValue();
18005
18006    // All sizes must be a power of two.
18007    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
18008      return SDValue();
18009
18010    // Attempt to load the original value using scalar loads.
18011    // Find the largest scalar type that divides the total loaded size.
18012    MVT SclrLoadTy = MVT::i8;
18013    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
18014         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
18015      MVT Tp = (MVT::SimpleValueType)tp;
18016      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18017        SclrLoadTy = Tp;
18018      }
18019    }
18020
18021    // On 32-bit systems, 64-bit integer loads are not legal. Try bitcasting to f64.
18022    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18023        (64 <= MemSz))
18024      SclrLoadTy = MVT::f64;
18025
18026    // Calculate the number of scalar loads that we need to perform
18027    // in order to load our vector from memory.
18028    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18029    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
18030      return SDValue();
18031
18032    unsigned loadRegSize = RegSz;
18033    if (Ext == ISD::SEXTLOAD && RegSz == 256)
18034      loadRegSize /= 2;
18035
18036    // Represent our vector as a sequence of elements of the largest scalar
18037    // type that we can load.
18038    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
18039      loadRegSize/SclrLoadTy.getSizeInBits());
18040
18041    // Represent the data using the same element type that is stored in
18042    // memory. In practice, we "widen" MemVT.
18043    EVT WideVecVT =
18044          EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18045                       loadRegSize/MemVT.getScalarType().getSizeInBits());
18046
18047    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18048      "Invalid vector type");
18049
18050    // We can't shuffle using an illegal type.
18051    if (!TLI.isTypeLegal(WideVecVT))
18052      return SDValue();
18053
18054    SmallVector<SDValue, 8> Chains;
18055    SDValue Ptr = Ld->getBasePtr();
18056    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
18057                                        TLI.getPointerTy());
18058    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18059
18060    for (unsigned i = 0; i < NumLoads; ++i) {
18061      // Perform a single load.
18062      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
18063                                       Ptr, Ld->getPointerInfo(),
18064                                       Ld->isVolatile(), Ld->isNonTemporal(),
18065                                       Ld->isInvariant(), Ld->getAlignment());
18066      Chains.push_back(ScalarLoad.getValue(1));
18067      // Create the first element using SCALAR_TO_VECTOR in order to avoid
18068      // another round of DAGCombining.
18069      if (i == 0)
18070        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18071      else
18072        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18073                          ScalarLoad, DAG.getIntPtrConstant(i));
18074
18075      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18076    }
18077
18078    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
18079                               Chains.size());
18080
18081    // Bitcast the loaded value to a vector of the original element type, in
18082    // the size of the target vector type.
18083    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
18084    unsigned SizeRatio = RegSz/MemSz;
18085
18086    if (Ext == ISD::SEXTLOAD) {
18087      // If we have SSE4.1 we can directly emit a VSEXT node.
18088      if (Subtarget->hasSSE41()) {
18089        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
18090        return DCI.CombineTo(N, Sext, TF, true);
18091      }
18092
18093      // Otherwise we'll shuffle the small elements into the high bits of the
18094      // larger type and perform an arithmetic shift. If the shift is not legal,
18095      // it's better to scalarize.
18096      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
18097        return SDValue();
18098
18099      // Redistribute the loaded elements into the different locations.
18100      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
18101      for (unsigned i = 0; i != NumElems; ++i)
18102        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
18103
18104      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18105                                           DAG.getUNDEF(WideVecVT),
18106                                           &ShuffleVec[0]);
18107
18108      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
18109
18110      // Build the arithmetic shift.
18111      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
18112                     MemVT.getVectorElementType().getSizeInBits();
18113      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
18114                          DAG.getConstant(Amt, RegVT));
18115
18116      return DCI.CombineTo(N, Shuff, TF, true);
18117    }
18118
18119    // Redistribute the loaded elements into the different locations.
18120    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
18121    for (unsigned i = 0; i != NumElems; ++i)
18122      ShuffleVec[i*SizeRatio] = i;
18123
18124    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18125                                         DAG.getUNDEF(WideVecVT),
18126                                         &ShuffleVec[0]);
18127
18128    // Bitcast to the requested type.
18129    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
18130    // Replace the original load with the new sequence
18131    // and return the new chain.
18132    return DCI.CombineTo(N, Shuff, TF, true);
18133  }
18134
18135  return SDValue();
18136}
18137
18138/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
18139static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
18140                                   const X86Subtarget *Subtarget) {
18141  StoreSDNode *St = cast<StoreSDNode>(N);
18142  EVT VT = St->getValue().getValueType();
18143  EVT StVT = St->getMemoryVT();
18144  SDLoc dl(St);
18145  SDValue StoredVal = St->getOperand(1);
18146  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18147
18148  // If we are saving a concatenation of two XMM registers, perform two stores.
18149  // On Sandy Bridge, 256-bit memory operations are executed by two
18150  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
18151  // memory operation.
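  // For example, an unaligned 256-bit store of a v8i32 value becomes two
  // 128-bit stores of its low and high halves at Ptr and Ptr+16.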
18152  unsigned Alignment = St->getAlignment();
18153  bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
18154  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
18155      StVT == VT && !IsAligned) {
18156    unsigned NumElems = VT.getVectorNumElements();
18157    if (NumElems < 2)
18158      return SDValue();
18159
18160    SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
18161    SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
18162
18163    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
18164    SDValue Ptr0 = St->getBasePtr();
18165    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
18166
18167    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
18168                                St->getPointerInfo(), St->isVolatile(),
18169                                St->isNonTemporal(), Alignment);
18170    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
18171                                St->getPointerInfo(), St->isVolatile(),
18172                                St->isNonTemporal(),
18173                                std::min(16U, Alignment));
18174    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
18175  }
18176
18177  // Optimize trunc store (of multiple scalars) to shuffle and store.
18178  // First, pack all of the elements in one place. Next, store to memory
18179  // in fewer chunks.
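  // For example, a truncating store of v4i32 to v4i16 shuffles the four 16-bit
  // results into the low 64 bits of the register and then emits a single
  // 64-bit store.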
18180  if (St->isTruncatingStore() && VT.isVector()) {
18181    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18182    unsigned NumElems = VT.getVectorNumElements();
18183    assert(StVT != VT && "Cannot truncate to the same type");
18184    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
18185    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
18186
18187    // The From and To sizes and the element count must be powers of two.
18188    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
18189    // We are going to use the original vector elt for storing.
18190    // Accumulated smaller vector elements must be a multiple of the store size.
18191    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
18192
18193    unsigned SizeRatio  = FromSz / ToSz;
18194
18195    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
18196
18197    // Create a type on which we perform the shuffle
18198    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
18199            StVT.getScalarType(), NumElems*SizeRatio);
18200
18201    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
18202
18203    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
18204    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
18205    for (unsigned i = 0; i != NumElems; ++i)
18206      ShuffleVec[i] = i * SizeRatio;
18207
18208    // Can't shuffle using an illegal type.
18209    if (!TLI.isTypeLegal(WideVecVT))
18210      return SDValue();
18211
18212    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
18213                                         DAG.getUNDEF(WideVecVT),
18214                                         &ShuffleVec[0]);
18215    // At this point all of the data is stored at the bottom of the
18216    // register. We now need to save it to mem.
18217
18218    // Find the largest store unit
18219    MVT StoreType = MVT::i8;
18220    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
18221         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
18222      MVT Tp = (MVT::SimpleValueType)tp;
18223      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
18224        StoreType = Tp;
18225    }
18226
18227    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18228    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
18229        (64 <= NumElems * ToSz))
18230      StoreType = MVT::f64;
18231
18232    // Bitcast the original vector into a vector of store-size units
18233    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
18234            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
18235    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
18236    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
18237    SmallVector<SDValue, 8> Chains;
18238    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
18239                                        TLI.getPointerTy());
18240    SDValue Ptr = St->getBasePtr();
18241
18242    // Perform one or more big stores into memory.
18243    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
18244      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
18245                                   StoreType, ShuffWide,
18246                                   DAG.getIntPtrConstant(i));
18247      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
18248                                St->getPointerInfo(), St->isVolatile(),
18249                                St->isNonTemporal(), St->getAlignment());
18250      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18251      Chains.push_back(Ch);
18252    }
18253
18254    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
18255                               Chains.size());
18256  }
18257
18258  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
18259  // the FP state in cases where an emms may be missing.
18260  // A preferable solution to the general problem is to figure out the right
18261  // places to insert EMMS.  This qualifies as a quick hack.
18262
18263  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
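  // For example, on a 32-bit target with SSE2, (store (load i64 p), q) is
  // turned into an f64 load/store pair rather than two i32 load/store pairs.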
18264  if (VT.getSizeInBits() != 64)
18265    return SDValue();
18266
18267  const Function *F = DAG.getMachineFunction().getFunction();
18268  bool NoImplicitFloatOps = F->getAttributes().
18269    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
18270  bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
18271                     && Subtarget->hasSSE2();
18272  if ((VT.isVector() ||
18273       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
18274      isa<LoadSDNode>(St->getValue()) &&
18275      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
18276      St->getChain().hasOneUse() && !St->isVolatile()) {
18277    SDNode* LdVal = St->getValue().getNode();
18278    LoadSDNode *Ld = 0;
18279    int TokenFactorIndex = -1;
18280    SmallVector<SDValue, 8> Ops;
18281    SDNode* ChainVal = St->getChain().getNode();
18282    // Must be a store of a load.  We currently handle two cases:  the load
18283    // is a direct child, and it's under an intervening TokenFactor.  It is
18284    // possible to dig deeper under nested TokenFactors.
18285    if (ChainVal == LdVal)
18286      Ld = cast<LoadSDNode>(St->getChain());
18287    else if (St->getValue().hasOneUse() &&
18288             ChainVal->getOpcode() == ISD::TokenFactor) {
18289      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
18290        if (ChainVal->getOperand(i).getNode() == LdVal) {
18291          TokenFactorIndex = i;
18292          Ld = cast<LoadSDNode>(St->getValue());
18293        } else
18294          Ops.push_back(ChainVal->getOperand(i));
18295      }
18296    }
18297
18298    if (!Ld || !ISD::isNormalLoad(Ld))
18299      return SDValue();
18300
18301    // If this is not the MMX case, i.e. we are just turning i64 load/store
18302    // into f64 load/store, avoid the transformation if there are multiple
18303    // uses of the loaded value.
18304    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
18305      return SDValue();
18306
18307    SDLoc LdDL(Ld);
18308    SDLoc StDL(N);
18309    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
18310    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
18311    // pair instead.
18312    if (Subtarget->is64Bit() || F64IsLegal) {
18313      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
18314      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
18315                                  Ld->getPointerInfo(), Ld->isVolatile(),
18316                                  Ld->isNonTemporal(), Ld->isInvariant(),
18317                                  Ld->getAlignment());
18318      SDValue NewChain = NewLd.getValue(1);
18319      if (TokenFactorIndex != -1) {
18320        Ops.push_back(NewChain);
18321        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
18322                               Ops.size());
18323      }
18324      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
18325                          St->getPointerInfo(),
18326                          St->isVolatile(), St->isNonTemporal(),
18327                          St->getAlignment());
18328    }
18329
18330    // Otherwise, lower to two pairs of 32-bit loads / stores.
18331    SDValue LoAddr = Ld->getBasePtr();
18332    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
18333                                 DAG.getConstant(4, MVT::i32));
18334
18335    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
18336                               Ld->getPointerInfo(),
18337                               Ld->isVolatile(), Ld->isNonTemporal(),
18338                               Ld->isInvariant(), Ld->getAlignment());
18339    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
18340                               Ld->getPointerInfo().getWithOffset(4),
18341                               Ld->isVolatile(), Ld->isNonTemporal(),
18342                               Ld->isInvariant(),
18343                               MinAlign(Ld->getAlignment(), 4));
18344
18345    SDValue NewChain = LoLd.getValue(1);
18346    if (TokenFactorIndex != -1) {
18347      Ops.push_back(LoLd);
18348      Ops.push_back(HiLd);
18349      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
18350                             Ops.size());
18351    }
18352
18353    LoAddr = St->getBasePtr();
18354    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
18355                         DAG.getConstant(4, MVT::i32));
18356
18357    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
18358                                St->getPointerInfo(),
18359                                St->isVolatile(), St->isNonTemporal(),
18360                                St->getAlignment());
18361    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
18362                                St->getPointerInfo().getWithOffset(4),
18363                                St->isVolatile(),
18364                                St->isNonTemporal(),
18365                                MinAlign(St->getAlignment(), 4));
18366    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
18367  }
18368  return SDValue();
18369}
18370
18371/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
18372/// and return the operands for the horizontal operation in LHS and RHS.  A
18373/// horizontal operation performs the binary operation on successive elements
18374/// of its first operand, then on successive elements of its second operand,
18375/// returning the resulting values in a vector.  For example, if
18376///   A = < float a0, float a1, float a2, float a3 >
18377/// and
18378///   B = < float b0, float b1, float b2, float b3 >
18379/// then the result of doing a horizontal operation on A and B is
18380///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
18381/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
18382/// A horizontal-op B, for some already available A and B, and if so then LHS is
18383/// set to A, RHS to B, and the routine returns 'true'.
18384/// Note that the binary operation should have the property that if one of the
18385/// operands is UNDEF then the result is UNDEF.
18386static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
18387  // Look for the following pattern: if
18388  //   A = < float a0, float a1, float a2, float a3 >
18389  //   B = < float b0, float b1, float b2, float b3 >
18390  // and
18391  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
18392  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
18393  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
18394  // which is A horizontal-op B.
18395
18396  // At least one of the operands should be a vector shuffle.
18397  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
18398      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
18399    return false;
18400
18401  MVT VT = LHS.getSimpleValueType();
18402
18403  assert((VT.is128BitVector() || VT.is256BitVector()) &&
18404         "Unsupported vector type for horizontal add/sub");
18405
18406  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
18407  // operate independently on 128-bit lanes.
18408  unsigned NumElts = VT.getVectorNumElements();
18409  unsigned NumLanes = VT.getSizeInBits()/128;
18410  unsigned NumLaneElts = NumElts / NumLanes;
18411  assert((NumLaneElts % 2 == 0) &&
18412         "Vector type should have an even number of elements in each lane");
18413  unsigned HalfLaneElts = NumLaneElts/2;
18414
18415  // View LHS in the form
18416  //   LHS = VECTOR_SHUFFLE A, B, LMask
18417  // If LHS is not a shuffle then pretend it is the shuffle
18418  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
18419  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
18420  // type VT.
18421  SDValue A, B;
18422  SmallVector<int, 16> LMask(NumElts);
18423  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
18424    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
18425      A = LHS.getOperand(0);
18426    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
18427      B = LHS.getOperand(1);
18428    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
18429    std::copy(Mask.begin(), Mask.end(), LMask.begin());
18430  } else {
18431    if (LHS.getOpcode() != ISD::UNDEF)
18432      A = LHS;
18433    for (unsigned i = 0; i != NumElts; ++i)
18434      LMask[i] = i;
18435  }
18436
18437  // Likewise, view RHS in the form
18438  //   RHS = VECTOR_SHUFFLE C, D, RMask
18439  SDValue C, D;
18440  SmallVector<int, 16> RMask(NumElts);
18441  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
18442    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
18443      C = RHS.getOperand(0);
18444    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
18445      D = RHS.getOperand(1);
18446    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
18447    std::copy(Mask.begin(), Mask.end(), RMask.begin());
18448  } else {
18449    if (RHS.getOpcode() != ISD::UNDEF)
18450      C = RHS;
18451    for (unsigned i = 0; i != NumElts; ++i)
18452      RMask[i] = i;
18453  }
18454
18455  // Check that the shuffles are both shuffling the same vectors.
18456  if (!(A == C && B == D) && !(A == D && B == C))
18457    return false;
18458
18459  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
18460  if (!A.getNode() && !B.getNode())
18461    return false;
18462
18463  // If A and B occur in reverse order in RHS, then "swap" them (which means
18464  // rewriting the mask).
18465  if (A != C)
18466    CommuteVectorShuffleMask(RMask, NumElts);
18467
18468  // At this point LHS and RHS are equivalent to
18469  //   LHS = VECTOR_SHUFFLE A, B, LMask
18470  //   RHS = VECTOR_SHUFFLE A, B, RMask
18471  // Check that the masks correspond to performing a horizontal operation.
18472  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
18473    for (unsigned i = 0; i != NumLaneElts; ++i) {
18474      int LIdx = LMask[i+l], RIdx = RMask[i+l];
18475
18476      // Ignore any UNDEF components.
18477      if (LIdx < 0 || RIdx < 0 ||
18478          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
18479          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
18480        continue;
18481
18482      // Check that successive elements are being operated on.  If not, this is
18483      // not a horizontal operation.
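      // For example (illustrative), with v4f32 in lane 0: result element i = 1
      // reads from source 0, so Index = 2 and we require LIdx == 2, RIdx == 3
      // (or the swapped pair when the operation is commutative).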
18484      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
18485      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
18486      if (!(LIdx == Index && RIdx == Index + 1) &&
18487          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
18488        return false;
18489    }
18490  }
18491
18492  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
18493  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
18494  return true;
18495}
18496
18497/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
18498static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
18499                                  const X86Subtarget *Subtarget) {
18500  EVT VT = N->getValueType(0);
18501  SDValue LHS = N->getOperand(0);
18502  SDValue RHS = N->getOperand(1);
18503
18504  // Try to synthesize horizontal adds from adds of shuffles.
18505  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
18506       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
18507      isHorizontalBinOp(LHS, RHS, true))
18508    return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
18509  return SDValue();
18510}
18511
18512/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
18513static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
18514                                  const X86Subtarget *Subtarget) {
18515  EVT VT = N->getValueType(0);
18516  SDValue LHS = N->getOperand(0);
18517  SDValue RHS = N->getOperand(1);
18518
18519  // Try to synthesize horizontal subs from subs of shuffles.
18520  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
18521       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
18522      isHorizontalBinOp(LHS, RHS, false))
18523    return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
18524  return SDValue();
18525}
18526
18527/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
18528/// X86ISD::FXOR nodes.
18529static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
18530  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
18531  // F[X]OR(0.0, x) -> x
18532  // F[X]OR(x, 0.0) -> x
18533  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
18534    if (C->getValueAPF().isPosZero())
18535      return N->getOperand(1);
18536  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
18537    if (C->getValueAPF().isPosZero())
18538      return N->getOperand(0);
18539  return SDValue();
18540}
18541
18542/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
18543/// X86ISD::FMAX nodes.
18544static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
18545  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
18546
18547  // Only perform these optimizations when unsafe FP math is enabled.
18548  if (!DAG.getTarget().Options.UnsafeFPMath)
18549    return SDValue();
18550
18551  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
18552  // into FMINC and FMAXC, which are Commutative operations.
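  // (Background: x86 minss/maxss are not commutative; they return the second
  // operand when the inputs are unordered, so FMIN/FMAX must preserve operand
  // order. Under unsafe math that distinction can be ignored.)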
18553  unsigned NewOp = 0;
18554  switch (N->getOpcode()) {
18555    default: llvm_unreachable("unknown opcode");
18556    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
18557    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
18558  }
18559
18560  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
18561                     N->getOperand(0), N->getOperand(1));
18562}
18563
18564/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
18565static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
18566  // FAND(0.0, x) -> 0.0
18567  // FAND(x, 0.0) -> 0.0
18568  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
18569    if (C->getValueAPF().isPosZero())
18570      return N->getOperand(0);
18571  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
18572    if (C->getValueAPF().isPosZero())
18573      return N->getOperand(1);
18574  return SDValue();
18575}
18576
18577/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes
18578static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
18579  // FANDN(0.0, x) -> x
18580  // FANDN(x, 0.0) -> 0.0
18581  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
18582    if (C->getValueAPF().isPosZero())
18583      return N->getOperand(1);
18584  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
18585    if (C->getValueAPF().isPosZero())
18586      return N->getOperand(1);
18587  return SDValue();
18588}
18589
18590static SDValue PerformBTCombine(SDNode *N,
18591                                SelectionDAG &DAG,
18592                                TargetLowering::DAGCombinerInfo &DCI) {
18593  // BT ignores high bits in the bit index operand.
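  // For example (illustrative), a 32-bit bit index only has its low 5 bits
  // demanded, so an explicit "and $31" mask on the index can be dropped.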
18594  SDValue Op1 = N->getOperand(1);
18595  if (Op1.hasOneUse()) {
18596    unsigned BitWidth = Op1.getValueSizeInBits();
18597    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
18598    APInt KnownZero, KnownOne;
18599    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
18600                                          !DCI.isBeforeLegalizeOps());
18601    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18602    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
18603        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
18604      DCI.CommitTargetLoweringOpt(TLO);
18605  }
18606  return SDValue();
18607}
18608
18609static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
18610  SDValue Op = N->getOperand(0);
18611  if (Op.getOpcode() == ISD::BITCAST)
18612    Op = Op.getOperand(0);
18613  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
18614  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
18615      VT.getVectorElementType().getSizeInBits() ==
18616      OpVT.getVectorElementType().getSizeInBits()) {
18617    return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
18618  }
18619  return SDValue();
18620}
18621
18622static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
18623                                               const X86Subtarget *Subtarget) {
18624  EVT VT = N->getValueType(0);
18625  if (!VT.isVector())
18626    return SDValue();
18627
18628  SDValue N0 = N->getOperand(0);
18629  SDValue N1 = N->getOperand(1);
18630  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
18631  SDLoc dl(N);
18632
18633  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
18634  // AVX2, since there is no sign-extended shift right operation on a vector
18635  // with 64-bit elements.
18636  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
18637  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
18638  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
18639      N0.getOpcode() == ISD::SIGN_EXTEND)) {
18640    SDValue N00 = N0.getOperand(0);
18641
18642    // An extending load has a better lowering on AVX2: it may be replaced
18643    // with an X86ISD::VSEXT node, so leave it alone here.
18644    if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
18645      if (!ISD::isNormalLoad(N00.getNode()))
18646        return SDValue();
18647
18648    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
18649        SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
18650                                  N00, N1);
18651      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
18652    }
18653  }
18654  return SDValue();
18655}
18656
18657static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
18658                                  TargetLowering::DAGCombinerInfo &DCI,
18659                                  const X86Subtarget *Subtarget) {
18660  if (!DCI.isBeforeLegalizeOps())
18661    return SDValue();
18662
18663  if (!Subtarget->hasFp256())
18664    return SDValue();
18665
18666  EVT VT = N->getValueType(0);
18667  if (VT.isVector() && VT.getSizeInBits() == 256) {
18668    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
18669    if (R.getNode())
18670      return R;
18671  }
18672
18673  return SDValue();
18674}
18675
18676static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
18677                                 const X86Subtarget* Subtarget) {
18678  SDLoc dl(N);
18679  EVT VT = N->getValueType(0);
18680
18681  // Let legalize expand this if it isn't a legal type yet.
18682  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18683    return SDValue();
18684
18685  EVT ScalarVT = VT.getScalarType();
18686  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
18687      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
18688    return SDValue();
18689
18690  SDValue A = N->getOperand(0);
18691  SDValue B = N->getOperand(1);
18692  SDValue C = N->getOperand(2);
18693
18694  bool NegA = (A.getOpcode() == ISD::FNEG);
18695  bool NegB = (B.getOpcode() == ISD::FNEG);
18696  bool NegC = (C.getOpcode() == ISD::FNEG);
18697
18698  // The multiplication is negated when exactly one of A and B is negated.
18699  bool NegMul = (NegA != NegB);
18700  if (NegA)
18701    A = A.getOperand(0);
18702  if (NegB)
18703    B = B.getOperand(0);
18704  if (NegC)
18705    C = C.getOperand(0);
18706
18707  unsigned Opcode;
18708  if (!NegMul)
18709    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
18710  else
18711    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
18712
18713  return DAG.getNode(Opcode, dl, VT, A, B, C);
18714}
18715
18716static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
18717                                  TargetLowering::DAGCombinerInfo &DCI,
18718                                  const X86Subtarget *Subtarget) {
18719  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
18720  //           (and (i32 x86isd::setcc_carry), 1)
18721  // This eliminates the zext. This transformation is necessary because
18722  // ISD::SETCC is always legalized to i8.
18723  SDLoc dl(N);
18724  SDValue N0 = N->getOperand(0);
18725  EVT VT = N->getValueType(0);
18726
18727  if (N0.getOpcode() == ISD::AND &&
18728      N0.hasOneUse() &&
18729      N0.getOperand(0).hasOneUse()) {
18730    SDValue N00 = N0.getOperand(0);
18731    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
18732      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
18733      if (!C || C->getZExtValue() != 1)
18734        return SDValue();
18735      return DAG.getNode(ISD::AND, dl, VT,
18736                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
18737                                     N00.getOperand(0), N00.getOperand(1)),
18738                         DAG.getConstant(1, VT));
18739    }
18740  }
18741
18742  if (VT.is256BitVector()) {
18743    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
18744    if (R.getNode())
18745      return R;
18746  }
18747
18748  return SDValue();
18749}
18750
18751// Optimize x == -y --> x+y == 0
18752//          x != -y --> x+y != 0
18753static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
18754  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
18755  SDValue LHS = N->getOperand(0);
18756  SDValue RHS = N->getOperand(1);
18757
18758  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
18759    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
18760      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
18761        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
18762                                   LHS.getValueType(), RHS, LHS.getOperand(1));
18763        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
18764                            addV, DAG.getConstant(0, addV.getValueType()), CC);
18765      }
18766  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
18767    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
18768      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
18769        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
18770                                   RHS.getValueType(), LHS, RHS.getOperand(1));
18771        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
18772                            addV, DAG.getConstant(0, addV.getValueType()), CC);
18773      }
18774  return SDValue();
18775}
18776
18777  // Helper function for PerformSETCCCombine. It materializes "setb reg" as
18778  // "sbb reg,reg", since the latter can be extended without a zext and
18779  // produces an all-ones bit which is more useful than 0/1 in some cases.
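// Concretely (a sketch), instead of "setb %al" this emits the equivalent of
// "sbb %al, %al" followed by "and $1, %al"; the intermediate sbb result is
// 0 or -1, which can be widened without a zext.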
18780static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
18781  return DAG.getNode(ISD::AND, DL, MVT::i8,
18782                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
18783                                 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
18784                     DAG.getConstant(1, MVT::i8));
18785}
18786
18787// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
18788static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
18789                                   TargetLowering::DAGCombinerInfo &DCI,
18790                                   const X86Subtarget *Subtarget) {
18791  SDLoc DL(N);
18792  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
18793  SDValue EFLAGS = N->getOperand(1);
18794
18795  if (CC == X86::COND_A) {
18796    // Try to convert COND_A into COND_B in an attempt to facilitate
18797    // materializing "setb reg".
18798    //
18799    // Do not flip "x > c", where "c" is a constant, because the CMP
18800    // instruction cannot take an immediate as its first operand.
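    // For example (illustrative): "seta" of (sub x, y) can become "setb" of
    // (sub y, x), since x >u y is equivalent to y <u x.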
18801    //
18802    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
18803        EFLAGS.getValueType().isInteger() &&
18804        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
18805      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
18806                                   EFLAGS.getNode()->getVTList(),
18807                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
18808      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
18809      return MaterializeSETB(DL, NewEFLAGS, DAG);
18810    }
18811  }
18812
18813  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
18814  // a zext and produces an all-ones bit which is more useful than 0/1 in some
18815  // cases.
18816  if (CC == X86::COND_B)
18817    return MaterializeSETB(DL, EFLAGS, DAG);
18818
18819  SDValue Flags;
18820
18821  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
18822  if (Flags.getNode()) {
18823    SDValue Cond = DAG.getConstant(CC, MVT::i8);
18824    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
18825  }
18826
18827  return SDValue();
18828}
18829
18830// Optimize branch condition evaluation.
18831//
18832static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
18833                                    TargetLowering::DAGCombinerInfo &DCI,
18834                                    const X86Subtarget *Subtarget) {
18835  SDLoc DL(N);
18836  SDValue Chain = N->getOperand(0);
18837  SDValue Dest = N->getOperand(1);
18838  SDValue EFLAGS = N->getOperand(3);
18839  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
18840
18841  SDValue Flags;
18842
18843  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
18844  if (Flags.getNode()) {
18845    SDValue Cond = DAG.getConstant(CC, MVT::i8);
18846    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
18847                       Flags);
18848  }
18849
18850  return SDValue();
18851}
18852
18853static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
18854                                        const X86TargetLowering *XTLI) {
18855  SDValue Op0 = N->getOperand(0);
18856  EVT InVT = Op0->getValueType(0);
18857
18858  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
18859  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
18860    SDLoc dl(N);
18861    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
18862    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
18863    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
18864  }
18865
18866  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
18867  // a 32-bit target where SSE doesn't support i64->FP operations.
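  // For example (illustrative), on such a target (sitofp (load i64)) can be
  // lowered to a single x87 "fild" that reads the original memory operand.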
18868  if (Op0.getOpcode() == ISD::LOAD) {
18869    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
18870    EVT VT = Ld->getValueType(0);
18871    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
18872        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
18873        !XTLI->getSubtarget()->is64Bit() &&
18874        VT == MVT::i64) {
18875      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
18876                                          Ld->getChain(), Op0, DAG);
18877      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
18878      return FILDChain;
18879    }
18880  }
18881  return SDValue();
18882}
18883
18884// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
18885static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
18886                                 X86TargetLowering::DAGCombinerInfo &DCI) {
18887  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
18888  // the result is either zero or one (depending on the input carry bit).
18889  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
18890  if (X86::isZeroNode(N->getOperand(0)) &&
18891      X86::isZeroNode(N->getOperand(1)) &&
18892      // We don't have a good way to replace an EFLAGS use, so only do this when
18893      // dead right now.
18894      SDValue(N, 1).use_empty()) {
18895    SDLoc DL(N);
18896    EVT VT = N->getValueType(0);
18897    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
18898    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
18899                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
18900                                           DAG.getConstant(X86::COND_B,MVT::i8),
18901                                           N->getOperand(2)),
18902                               DAG.getConstant(1, VT));
18903    return DCI.CombineTo(N, Res1, CarryOut);
18904  }
18905
18906  return SDValue();
18907}
18908
18909// fold (add Y, (sete  X, 0)) -> adc  0, Y
18910//      (add Y, (setne X, 0)) -> sbb -1, Y
18911//      (sub (sete  X, 0), Y) -> sbb  0, Y
18912//      (sub (setne X, 0), Y) -> adc -1, Y
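// These folds work because "cmp X, 1" sets the carry flag exactly when X == 0
// (unsigned X < 1), letting adc/sbb absorb the setcc result for free.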
18913static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
18914  SDLoc DL(N);
18915
18916  // Look through ZExts.
18917  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
18918  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
18919    return SDValue();
18920
18921  SDValue SetCC = Ext.getOperand(0);
18922  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
18923    return SDValue();
18924
18925  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
18926  if (CC != X86::COND_E && CC != X86::COND_NE)
18927    return SDValue();
18928
18929  SDValue Cmp = SetCC.getOperand(1);
18930  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
18931      !X86::isZeroNode(Cmp.getOperand(1)) ||
18932      !Cmp.getOperand(0).getValueType().isInteger())
18933    return SDValue();
18934
18935  SDValue CmpOp0 = Cmp.getOperand(0);
18936  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
18937                               DAG.getConstant(1, CmpOp0.getValueType()));
18938
18939  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
18940  if (CC == X86::COND_NE)
18941    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
18942                       DL, OtherVal.getValueType(), OtherVal,
18943                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
18944  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
18945                     DL, OtherVal.getValueType(), OtherVal,
18946                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
18947}
18948
18949/// PerformAddCombine - Do target-specific dag combines on integer adds.
18950static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
18951                                 const X86Subtarget *Subtarget) {
18952  EVT VT = N->getValueType(0);
18953  SDValue Op0 = N->getOperand(0);
18954  SDValue Op1 = N->getOperand(1);
18955
18956  // Try to synthesize horizontal adds from adds of shuffles.
18957  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
18958       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
18959      isHorizontalBinOp(Op0, Op1, true))
18960    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
18961
18962  return OptimizeConditionalInDecrement(N, DAG);
18963}
18964
18965static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
18966                                 const X86Subtarget *Subtarget) {
18967  SDValue Op0 = N->getOperand(0);
18968  SDValue Op1 = N->getOperand(1);
18969
18970  // X86 can't encode an immediate LHS of a sub. See if we can push the
18971  // negation into a preceding instruction.
18972  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
18973    // If the RHS of the sub is a XOR with one use and a constant, invert the
18974    // immediate. Then add one to the LHS of the sub so we can turn
18975    // X-Y -> X+~Y+1, saving one register.
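    // For example (illustrative): 5 - (X ^ 3) becomes (X ^ ~3) + 6, since
    // -(X ^ 3) == (X ^ ~3) + 1.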
18976    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
18977        isa<ConstantSDNode>(Op1.getOperand(1))) {
18978      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
18979      EVT VT = Op0.getValueType();
18980      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
18981                                   Op1.getOperand(0),
18982                                   DAG.getConstant(~XorC, VT));
18983      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
18984                         DAG.getConstant(C->getAPIntValue()+1, VT));
18985    }
18986  }
18987
18988  // Try to synthesize horizontal subs from subs of shuffles.
18989  EVT VT = N->getValueType(0);
18990  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
18991       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
18992      isHorizontalBinOp(Op0, Op1, true))
18993    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
18994
18995  return OptimizeConditionalInDecrement(N, DAG);
18996}
18997
18998/// performVZEXTCombine - Do target-specific dag combines on X86ISD::VZEXT nodes.
18999static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
19000                                        TargetLowering::DAGCombinerInfo &DCI,
19001                                        const X86Subtarget *Subtarget) {
19002  // (vzext (bitcast (vzext x))) -> (vzext x)
19003  SDValue In = N->getOperand(0);
19004  while (In.getOpcode() == ISD::BITCAST)
19005    In = In.getOperand(0);
19006
19007  if (In.getOpcode() != X86ISD::VZEXT)
19008    return SDValue();
19009
19010  return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
19011                     In.getOperand(0));
19012}
19013
19014SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
19015                                             DAGCombinerInfo &DCI) const {
19016  SelectionDAG &DAG = DCI.DAG;
19017  switch (N->getOpcode()) {
19018  default: break;
19019  case ISD::EXTRACT_VECTOR_ELT:
19020    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
19021  case ISD::VSELECT:
19022  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
19023  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
19024  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
19025  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
19026  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
19027  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
19028  case ISD::SHL:
19029  case ISD::SRA:
19030  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
19031  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
19032  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
19033  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
19034  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
19035  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
19036  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
19037  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
19038  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
19039  case X86ISD::FXOR:
19040  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
19041  case X86ISD::FMIN:
19042  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
19043  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
19044  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
19045  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
19046  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
19047  case ISD::ANY_EXTEND:
19048  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
19049  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
19050  case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
19051  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
19052  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
19053  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
19054  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
19055  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
19056  case X86ISD::SHUFP:       // Handle all target specific shuffles
19057  case X86ISD::PALIGNR:
19058  case X86ISD::UNPCKH:
19059  case X86ISD::UNPCKL:
19060  case X86ISD::MOVHLPS:
19061  case X86ISD::MOVLHPS:
19062  case X86ISD::PSHUFD:
19063  case X86ISD::PSHUFHW:
19064  case X86ISD::PSHUFLW:
19065  case X86ISD::MOVSS:
19066  case X86ISD::MOVSD:
19067  case X86ISD::VPERMILP:
19068  case X86ISD::VPERM2X128:
19069  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
19070  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
19071  }
19072
19073  return SDValue();
19074}
19075
19076/// isTypeDesirableForOp - Return true if the target has native support for
19077/// the specified value type and it is 'desirable' to use the type for the
19078/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
19079/// instruction encodings are longer and some i16 instructions are slow.
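/// For example (illustrative), a 16-bit "addw" needs a 0x66 operand-size
/// prefix that the equivalent 32-bit "addl" does not, so i16 arithmetic is
/// usually better performed in 32 bits.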
19080bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
19081  if (!isTypeLegal(VT))
19082    return false;
19083  if (VT != MVT::i16)
19084    return true;
19085
19086  switch (Opc) {
19087  default:
19088    return true;
19089  case ISD::LOAD:
19090  case ISD::SIGN_EXTEND:
19091  case ISD::ZERO_EXTEND:
19092  case ISD::ANY_EXTEND:
19093  case ISD::SHL:
19094  case ISD::SRL:
19095  case ISD::SUB:
19096  case ISD::ADD:
19097  case ISD::MUL:
19098  case ISD::AND:
19099  case ISD::OR:
19100  case ISD::XOR:
19101    return false;
19102  }
19103}
19104
19105/// IsDesirableToPromoteOp - This method queries the target whether it is
19106/// beneficial for dag combiner to promote the specified node. If true, it
19107/// should return the desired promotion type by reference.
19108bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
19109  EVT VT = Op.getValueType();
19110  if (VT != MVT::i16)
19111    return false;
19112
19113  bool Promote = false;
19114  bool Commute = false;
19115  switch (Op.getOpcode()) {
19116  default: break;
19117  case ISD::LOAD: {
19118    LoadSDNode *LD = cast<LoadSDNode>(Op);
19119    // If the non-extending load has a single use and it's not live out, then it
19120    // might be folded.
19121    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
19122                                                     Op.hasOneUse()*/) {
19123      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
19124             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
19125        // The only case where we'd want to promote LOAD (rather than it being
19126        // promoted as an operand) is when its only use is live out (a CopyToReg).
19127        if (UI->getOpcode() != ISD::CopyToReg)
19128          return false;
19129      }
19130    }
19131    Promote = true;
19132    break;
19133  }
19134  case ISD::SIGN_EXTEND:
19135  case ISD::ZERO_EXTEND:
19136  case ISD::ANY_EXTEND:
19137    Promote = true;
19138    break;
19139  case ISD::SHL:
19140  case ISD::SRL: {
19141    SDValue N0 = Op.getOperand(0);
19142    // Look out for (store (shl (load), x)).
19143    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
19144      return false;
19145    Promote = true;
19146    break;
19147  }
19148  case ISD::ADD:
19149  case ISD::MUL:
19150  case ISD::AND:
19151  case ISD::OR:
19152  case ISD::XOR:
19153    Commute = true;
19154    // fallthrough
19155  case ISD::SUB: {
19156    SDValue N0 = Op.getOperand(0);
19157    SDValue N1 = Op.getOperand(1);
19158    if (!Commute && MayFoldLoad(N1))
19159      return false;
19160    // Avoid disabling potential load folding opportunities.
19161    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
19162      return false;
19163    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
19164      return false;
19165    Promote = true;
19166  }
19167  }
19168
19169  PVT = MVT::i32;
19170  return Promote;
19171}
19172
19173//===----------------------------------------------------------------------===//
19174//                           X86 Inline Assembly Support
19175//===----------------------------------------------------------------------===//
19176
19177namespace {
19178  // Helper to match a string against a sequence of whitespace-separated pieces.
19179  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
19180    s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
19181
19182    for (unsigned i = 0, e = args.size(); i != e; ++i) {
19183      StringRef piece(*args[i]);
19184      if (!s.startswith(piece)) // Check if the piece matches.
19185        return false;
19186
19187      s = s.substr(piece.size());
19188      StringRef::size_type pos = s.find_first_not_of(" \t");
19189      if (pos == 0) // We matched a prefix.
19190        return false;
19191
19192      s = s.substr(pos);
19193    }
19194
19195    return s.empty();
19196  }
19197  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
19198}
19199
19200static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
19201
19202  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
19203    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
19204        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
19205        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
19206
19207      if (AsmPieces.size() == 3)
19208        return true;
19209      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
19210        return true;
19211    }
19212  }
19213  return false;
19214}
19215
19216bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
19217  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
19218
19219  std::string AsmStr = IA->getAsmString();
19220
19221  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
19222  if (!Ty || Ty->getBitWidth() % 16 != 0)
19223    return false;
19224
19225  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
19226  SmallVector<StringRef, 4> AsmPieces;
19227  SplitString(AsmStr, AsmPieces, ";\n");
19228
19229  switch (AsmPieces.size()) {
19230  default: return false;
19231  case 1:
19232    // FIXME: this should verify that we are targeting a 486 or better.  If not,
19233    // we will turn this bswap into something that will be lowered to logical
19234    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
19235    // lower so don't worry about this.
19236    // bswap $0
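    // For example (an illustrative IR sketch):
    //   %r = call i32 asm "bswapl $0", "=r,0"(i32 %x)
    // is recognized here and replaced with a call to @llvm.bswap.i32.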
19237    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
19238        matchAsm(AsmPieces[0], "bswapl", "$0") ||
19239        matchAsm(AsmPieces[0], "bswapq", "$0") ||
19240        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
19241        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
19242        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
19243      // No need to check constraints, nothing other than the equivalent of
19244      // "=r,0" would be valid here.
19245      return IntrinsicLowering::LowerToByteSwap(CI);
19246    }
19247
19248    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
19249    if (CI->getType()->isIntegerTy(16) &&
19250        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
19251        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
19252         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
19253      AsmPieces.clear();
19254      const std::string &ConstraintsStr = IA->getConstraintString();
19255      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
19256      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
19257      if (clobbersFlagRegisters(AsmPieces))
19258        return IntrinsicLowering::LowerToByteSwap(CI);
19259    }
19260    break;
19261  case 3:
19262    if (CI->getType()->isIntegerTy(32) &&
19263        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
19264        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
19265        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
19266        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
19267      AsmPieces.clear();
19268      const std::string &ConstraintsStr = IA->getConstraintString();
19269      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
19270      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
19271      if (clobbersFlagRegisters(AsmPieces))
19272        return IntrinsicLowering::LowerToByteSwap(CI);
19273    }
19274
19275    if (CI->getType()->isIntegerTy(64)) {
19276      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
19277      if (Constraints.size() >= 2 &&
19278          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
19279          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
19280        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
19281        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
19282            matchAsm(AsmPieces[1], "bswap", "%edx") &&
19283            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
19284          return IntrinsicLowering::LowerToByteSwap(CI);
19285      }
19286    }
19287    break;
19288  }
19289  return false;
19290}
19291
19292/// getConstraintType - Given a constraint letter, return the type of
19293/// constraint it is for this target.
19294X86TargetLowering::ConstraintType
19295X86TargetLowering::getConstraintType(const std::string &Constraint) const {
19296  if (Constraint.size() == 1) {
19297    switch (Constraint[0]) {
19298    case 'R':
19299    case 'q':
19300    case 'Q':
19301    case 'f':
19302    case 't':
19303    case 'u':
19304    case 'y':
19305    case 'x':
19306    case 'Y':
19307    case 'l':
19308      return C_RegisterClass;
19309    case 'a':
19310    case 'b':
19311    case 'c':
19312    case 'd':
19313    case 'S':
19314    case 'D':
19315    case 'A':
19316      return C_Register;
19317    case 'I':
19318    case 'J':
19319    case 'K':
19320    case 'L':
19321    case 'M':
19322    case 'N':
19323    case 'G':
19324    case 'C':
19325    case 'e':
19326    case 'Z':
19327      return C_Other;
19328    default:
19329      break;
19330    }
19331  }
19332  return TargetLowering::getConstraintType(Constraint);
19333}
19334
19335/// Examine constraint type and operand type and determine a weight value.
19336/// This object must already have been set up with the operand type
19337/// and the current alternative constraint selected.
19338TargetLowering::ConstraintWeight
19339  X86TargetLowering::getSingleConstraintMatchWeight(
19340    AsmOperandInfo &info, const char *constraint) const {
19341  ConstraintWeight weight = CW_Invalid;
19342  Value *CallOperandVal = info.CallOperandVal;
19343  // If we don't have a value, we can't do a match,
19344  // but allow it at the lowest weight.
19345  if (CallOperandVal == NULL)
19346    return CW_Default;
19347  Type *type = CallOperandVal->getType();
19348  // Look at the constraint type.
19349  switch (*constraint) {
19350  default:
19351    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
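    // Note: control falls through to the register cases below, which can
    // override the default weight for integer-typed operands.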
19352  case 'R':
19353  case 'q':
19354  case 'Q':
19355  case 'a':
19356  case 'b':
19357  case 'c':
19358  case 'd':
19359  case 'S':
19360  case 'D':
19361  case 'A':
19362    if (CallOperandVal->getType()->isIntegerTy())
19363      weight = CW_SpecificReg;
19364    break;
19365  case 'f':
19366  case 't':
19367  case 'u':
19368    if (type->isFloatingPointTy())
19369      weight = CW_SpecificReg;
19370    break;
19371  case 'y':
19372    if (type->isX86_MMXTy() && Subtarget->hasMMX())
19373      weight = CW_SpecificReg;
19374    break;
19375  case 'x':
19376  case 'Y':
19377    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
19378        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
19379      weight = CW_Register;
19380    break;
19381  case 'I':
19382    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
19383      if (C->getZExtValue() <= 31)
19384        weight = CW_Constant;
19385    }
19386    break;
19387  case 'J':
19388    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19389      if (C->getZExtValue() <= 63)
19390        weight = CW_Constant;
19391    }
19392    break;
19393  case 'K':
19394    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19395      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
19396        weight = CW_Constant;
19397    }
19398    break;
19399  case 'L':
19400    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19401      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
19402        weight = CW_Constant;
19403    }
19404    break;
19405  case 'M':
19406    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19407      if (C->getZExtValue() <= 3)
19408        weight = CW_Constant;
19409    }
19410    break;
19411  case 'N':
19412    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19413      if (C->getZExtValue() <= 0xff)
19414        weight = CW_Constant;
19415    }
19416    break;
19417  case 'G':
19418  case 'C':
19419    if (dyn_cast<ConstantFP>(CallOperandVal)) {
19420      weight = CW_Constant;
19421    }
19422    break;
19423  case 'e':
19424    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19425      if ((C->getSExtValue() >= -0x80000000LL) &&
19426          (C->getSExtValue() <= 0x7fffffffLL))
19427        weight = CW_Constant;
19428    }
19429    break;
19430  case 'Z':
19431    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
19432      if (C->getZExtValue() <= 0xffffffff)
19433        weight = CW_Constant;
19434    }
19435    break;
19436  }
19437  return weight;
19438}
19439
19440/// LowerXConstraint - try to replace an X constraint, which matches anything,
19441/// with another that has more specific requirements based on the type of the
19442/// corresponding operand.
19443const char *X86TargetLowering::
19444LowerXConstraint(EVT ConstraintVT) const {
19445  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
19446  // 'f' like normal targets.
19447  if (ConstraintVT.isFloatingPoint()) {
19448    if (Subtarget->hasSSE2())
19449      return "Y";
19450    if (Subtarget->hasSSE1())
19451      return "x";
19452  }
19453
19454  return TargetLowering::LowerXConstraint(ConstraintVT);
19455}
19456
19457/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
19458/// vector.  If it is invalid, don't add anything to Ops.
19459void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
19460                                                     std::string &Constraint,
19461                                                     std::vector<SDValue>&Ops,
19462                                                     SelectionDAG &DAG) const {
19463  SDValue Result(0, 0);
19464
19465  // Only support length 1 constraints for now.
19466  if (Constraint.length() > 1) return;
19467
19468  char ConstraintLetter = Constraint[0];
19469  switch (ConstraintLetter) {
19470  default: break;
19471  case 'I':
19472    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19473      if (C->getZExtValue() <= 31) {
19474        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
19475        break;
19476      }
19477    }
19478    return;
19479  case 'J':
19480    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19481      if (C->getZExtValue() <= 63) {
19482        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
19483        break;
19484      }
19485    }
19486    return;
19487  case 'K':
19488    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19489      if (isInt<8>(C->getSExtValue())) {
19490        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
19491        break;
19492      }
19493    }
19494    return;
19495  case 'N':
19496    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19497      if (C->getZExtValue() <= 255) {
19498        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
19499        break;
19500      }
19501    }
19502    return;
19503  case 'e': {
19504    // 32-bit signed value
19505    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19506      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
19507                                           C->getSExtValue())) {
19508        // Widen to 64 bits here to get it sign extended.
19509        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
19510        break;
19511      }
19512    // FIXME gcc accepts some relocatable values here too, but only in certain
19513    // memory models; it's complicated.
19514    }
19515    return;
19516  }
19517  case 'Z': {
19518    // 32-bit unsigned value
19519    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19520      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
19521                                           C->getZExtValue())) {
19522        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
19523        break;
19524      }
19525    }
19526    // FIXME gcc accepts some relocatable values here too, but only in certain
19527    // memory models; it's complicated.
19528    return;
19529  }
19530  case 'i': {
19531    // Literal immediates are always ok.
19532    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
19533      // Widen to 64 bits here to get it sign extended.
19534      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
19535      break;
19536    }
19537
19538    // In any sort of PIC mode addresses need to be computed at runtime by
19539    // adding in a register or some sort of table lookup.  These can't
19540    // be used as immediates.
19541    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
19542      return;
19543
19544    // If we are in non-pic codegen mode, we allow the address of a global (with
19545    // an optional displacement) to be used with 'i'.
19546    GlobalAddressSDNode *GA = 0;
19547    int64_t Offset = 0;
19548
19549    // Match either (GA), (GA+C), (GA+C1+C2), etc.
19550    while (1) {
19551      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
19552        Offset += GA->getOffset();
19553        break;
19554      } else if (Op.getOpcode() == ISD::ADD) {
19555        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
19556          Offset += C->getZExtValue();
19557          Op = Op.getOperand(0);
19558          continue;
19559        }
19560      } else if (Op.getOpcode() == ISD::SUB) {
19561        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
19562          Offset += -C->getZExtValue();
19563          Op = Op.getOperand(0);
19564          continue;
19565        }
19566      }
19567
19568      // Otherwise, this isn't something we can handle, reject it.
19569      return;
19570    }
19571
19572    const GlobalValue *GV = GA->getGlobal();
19573    // If we require an extra load to get this address, as in PIC mode, we
19574    // can't accept it.
19575    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
19576                                                        getTargetMachine())))
19577      return;
19578
19579    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
19580                                        GA->getValueType(0), Offset);
19581    break;
19582  }
19583  }
19584
19585  if (Result.getNode()) {
19586    Ops.push_back(Result);
19587    return;
19588  }
19589  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
19590}
19591
19592std::pair<unsigned, const TargetRegisterClass*>
19593X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
19594                                                MVT VT) const {
19595  // First, see if this is a constraint that directly corresponds to an LLVM
19596  // register class.
19597  if (Constraint.size() == 1) {
19598    // GCC Constraint Letters
19599    switch (Constraint[0]) {
19600    default: break;
19601      // TODO: Slight differences here in allocation order and leaving
19602      // RIP in the class. Do they matter any more here than they do
19603      // in the normal allocation?
19604    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
19605      if (Subtarget->is64Bit()) {
19606        if (VT == MVT::i32 || VT == MVT::f32)
19607          return std::make_pair(0U, &X86::GR32RegClass);
19608        if (VT == MVT::i16)
19609          return std::make_pair(0U, &X86::GR16RegClass);
19610        if (VT == MVT::i8 || VT == MVT::i1)
19611          return std::make_pair(0U, &X86::GR8RegClass);
19612        if (VT == MVT::i64 || VT == MVT::f64)
19613          return std::make_pair(0U, &X86::GR64RegClass);
19614        break;
19615      }
19616      // 32-bit fallthrough
19617    case 'Q':   // Q_REGS
19618      if (VT == MVT::i32 || VT == MVT::f32)
19619        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
19620      if (VT == MVT::i16)
19621        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
19622      if (VT == MVT::i8 || VT == MVT::i1)
19623        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
19624      if (VT == MVT::i64)
19625        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
19626      break;
19627    case 'r':   // GENERAL_REGS
19628    case 'l':   // INDEX_REGS
19629      if (VT == MVT::i8 || VT == MVT::i1)
19630        return std::make_pair(0U, &X86::GR8RegClass);
19631      if (VT == MVT::i16)
19632        return std::make_pair(0U, &X86::GR16RegClass);
19633      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
19634        return std::make_pair(0U, &X86::GR32RegClass);
19635      return std::make_pair(0U, &X86::GR64RegClass);
19636    case 'R':   // LEGACY_REGS
19637      if (VT == MVT::i8 || VT == MVT::i1)
19638        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
19639      if (VT == MVT::i16)
19640        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
19641      if (VT == MVT::i32 || !Subtarget->is64Bit())
19642        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
19643      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
19644    case 'f':  // FP Stack registers.
19645      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
19646      // value to the correct fpstack register class.
19647      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
19648        return std::make_pair(0U, &X86::RFP32RegClass);
19649      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
19650        return std::make_pair(0U, &X86::RFP64RegClass);
19651      return std::make_pair(0U, &X86::RFP80RegClass);
19652    case 'y':   // MMX_REGS if MMX allowed.
19653      if (!Subtarget->hasMMX()) break;
19654      return std::make_pair(0U, &X86::VR64RegClass);
19655    case 'Y':   // SSE_REGS if SSE2 allowed
19656      if (!Subtarget->hasSSE2()) break;
19657      // FALL THROUGH.
19658    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
19659      if (!Subtarget->hasSSE1()) break;
19660
19661      switch (VT.SimpleTy) {
19662      default: break;
19663      // Scalar SSE types.
19664      case MVT::f32:
19665      case MVT::i32:
19666        return std::make_pair(0U, &X86::FR32RegClass);
19667      case MVT::f64:
19668      case MVT::i64:
19669        return std::make_pair(0U, &X86::FR64RegClass);
19670      // Vector types.
19671      case MVT::v16i8:
19672      case MVT::v8i16:
19673      case MVT::v4i32:
19674      case MVT::v2i64:
19675      case MVT::v4f32:
19676      case MVT::v2f64:
19677        return std::make_pair(0U, &X86::VR128RegClass);
19678      // AVX types.
19679      case MVT::v32i8:
19680      case MVT::v16i16:
19681      case MVT::v8i32:
19682      case MVT::v4i64:
19683      case MVT::v8f32:
19684      case MVT::v4f64:
19685        return std::make_pair(0U, &X86::VR256RegClass);
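      // AVX-512 types.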
19686      case MVT::v8f64:
19687      case MVT::v16f32:
19688      case MVT::v16i32:
19689      case MVT::v8i64:
19690        return std::make_pair(0U, &X86::VR512RegClass);
19691      }
19692      break;
19693    }
19694  }
19695
19696  // Use the default implementation in TargetLowering to convert the register
19697  // constraint into a member of a register class.
19698  std::pair<unsigned, const TargetRegisterClass*> Res;
19699  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
19700
19701  // Not found as a standard register?
19702  if (Res.second == 0) {
19703    // Map st(0) .. st(7) onto ST0 .. ST7.
19704    if (Constraint.size() == 7 && Constraint[0] == '{' &&
19705        tolower(Constraint[1]) == 's' &&
19706        tolower(Constraint[2]) == 't' &&
19707        Constraint[3] == '(' &&
19708        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
19709        Constraint[5] == ')' &&
19710        Constraint[6] == '}') {
19711
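      // Pick the Nth fp-stack register by offsetting the digit in "st(N)"
      // from ST0.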
19712      Res.first = X86::ST0+Constraint[4]-'0';
19713      Res.second = &X86::RFP80RegClass;
19714      return Res;
19715    }
19716
19717    // GCC allows "st(0)" to be called just plain "st".
19718    if (StringRef("{st}").equals_lower(Constraint)) {
19719      Res.first = X86::ST0;
19720      Res.second = &X86::RFP80RegClass;
19721      return Res;
19722    }
19723
19724    // flags -> EFLAGS
19725    if (StringRef("{flags}").equals_lower(Constraint)) {
19726      Res.first = X86::EFLAGS;
19727      Res.second = &X86::CCRRegClass;
19728      return Res;
19729    }
19730
19731    // 'A' means the EAX:EDX register pair, used for 64-bit values.
19732    if (Constraint == "A") {
19733      Res.first = X86::EAX;
19734      Res.second = &X86::GR32_ADRegClass;
19735      return Res;
19736    }
19737    return Res;
19738  }
19739
19740  // Otherwise, check to see if this is a register class of the wrong value
19741  // type.  For example, we want to map "{ax}",i32 to {eax}; we don't want it
19742  // to turn into {ax},{dx}.
19743  if (Res.second->hasType(VT))
19744    return Res;   // Correct type already, nothing to do.
19745
19746  // All of the single-register GCC register classes map their values onto
19747  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
19748  // really want an 8-bit, 32-bit, or 64-bit register, map to the appropriate
19749  // register class and return the appropriate register.
19750  if (Res.second == &X86::GR16RegClass) {
19751    if (VT == MVT::i8 || VT == MVT::i1) {
19752      unsigned DestReg = 0;
19753      switch (Res.first) {
19754      default: break;
19755      case X86::AX: DestReg = X86::AL; break;
19756      case X86::DX: DestReg = X86::DL; break;
19757      case X86::CX: DestReg = X86::CL; break;
19758      case X86::BX: DestReg = X86::BL; break;
19759      }
19760      if (DestReg) {
19761        Res.first = DestReg;
19762        Res.second = &X86::GR8RegClass;
19763      }
19764    } else if (VT == MVT::i32 || VT == MVT::f32) {
19765      unsigned DestReg = 0;
19766      switch (Res.first) {
19767      default: break;
19768      case X86::AX: DestReg = X86::EAX; break;
19769      case X86::DX: DestReg = X86::EDX; break;
19770      case X86::CX: DestReg = X86::ECX; break;
19771      case X86::BX: DestReg = X86::EBX; break;
19772      case X86::SI: DestReg = X86::ESI; break;
19773      case X86::DI: DestReg = X86::EDI; break;
19774      case X86::BP: DestReg = X86::EBP; break;
19775      case X86::SP: DestReg = X86::ESP; break;
19776      }
19777      if (DestReg) {
19778        Res.first = DestReg;
19779        Res.second = &X86::GR32RegClass;
19780      }
19781    } else if (VT == MVT::i64 || VT == MVT::f64) {
19782      unsigned DestReg = 0;
19783      switch (Res.first) {
19784      default: break;
19785      case X86::AX: DestReg = X86::RAX; break;
19786      case X86::DX: DestReg = X86::RDX; break;
19787      case X86::CX: DestReg = X86::RCX; break;
19788      case X86::BX: DestReg = X86::RBX; break;
19789      case X86::SI: DestReg = X86::RSI; break;
19790      case X86::DI: DestReg = X86::RDI; break;
19791      case X86::BP: DestReg = X86::RBP; break;
19792      case X86::SP: DestReg = X86::RSP; break;
19793      }
19794      if (DestReg) {
19795        Res.first = DestReg;
19796        Res.second = &X86::GR64RegClass;
19797      }
19798    }
19799  } else if (Res.second == &X86::FR32RegClass ||
19800             Res.second == &X86::FR64RegClass ||
19801             Res.second == &X86::VR128RegClass ||
19802             Res.second == &X86::VR256RegClass ||
19803             Res.second == &X86::FR32XRegClass ||
19804             Res.second == &X86::FR64XRegClass ||
19805             Res.second == &X86::VR128XRegClass ||
19806             Res.second == &X86::VR256XRegClass ||
19807             Res.second == &X86::VR512RegClass) {
19808    // Handle references to XMM physical registers that got mapped into the
19809    // wrong class.  This can happen with constraints like {xmm0} where the
19810    // target independent register mapper will just pick the first match it can
19811    // find, ignoring the required type.
19812
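    // Re-map to the register class whose width matches VT, from scalar
    // FR32/FR64 up to the 512-bit VR512 class.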
19813    if (VT == MVT::f32 || VT == MVT::i32)
19814      Res.second = &X86::FR32RegClass;
19815    else if (VT == MVT::f64 || VT == MVT::i64)
19816      Res.second = &X86::FR64RegClass;
19817    else if (X86::VR128RegClass.hasType(VT))
19818      Res.second = &X86::VR128RegClass;
19819    else if (X86::VR256RegClass.hasType(VT))
19820      Res.second = &X86::VR256RegClass;
19821    else if (X86::VR512RegClass.hasType(VT))
19822      Res.second = &X86::VR512RegClass;
19823  }
19824
19825  return Res;
19826}
19827