1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#include "X86ISelLowering.h"
16#include "Utils/X86ShuffleDecode.h"
17#include "X86CallingConv.h"
18#include "X86InstrBuilder.h"
19#include "X86MachineFunctionInfo.h"
20#include "X86TargetMachine.h"
21#include "X86TargetObjectFile.h"
22#include "llvm/ADT/SmallSet.h"
23#include "llvm/ADT/Statistic.h"
24#include "llvm/ADT/StringExtras.h"
25#include "llvm/ADT/StringSwitch.h"
26#include "llvm/ADT/VariadicFunction.h"
27#include "llvm/CodeGen/IntrinsicLowering.h"
28#include "llvm/CodeGen/MachineFrameInfo.h"
29#include "llvm/CodeGen/MachineFunction.h"
30#include "llvm/CodeGen/MachineInstrBuilder.h"
31#include "llvm/CodeGen/MachineJumpTableInfo.h"
32#include "llvm/CodeGen/MachineModuleInfo.h"
33#include "llvm/CodeGen/MachineRegisterInfo.h"
34#include "llvm/IR/CallSite.h"
35#include "llvm/IR/CallingConv.h"
36#include "llvm/IR/Constants.h"
37#include "llvm/IR/DerivedTypes.h"
38#include "llvm/IR/Function.h"
39#include "llvm/IR/GlobalAlias.h"
40#include "llvm/IR/GlobalVariable.h"
41#include "llvm/IR/Instructions.h"
42#include "llvm/IR/Intrinsics.h"
43#include "llvm/MC/MCAsmInfo.h"
44#include "llvm/MC/MCContext.h"
45#include "llvm/MC/MCExpr.h"
46#include "llvm/MC/MCSymbol.h"
47#include "llvm/Support/CommandLine.h"
48#include "llvm/Support/Debug.h"
49#include "llvm/Support/ErrorHandling.h"
50#include "llvm/Support/MathExtras.h"
51#include "llvm/Target/TargetOptions.h"
52#include <bitset>
53#include <numeric>
54#include <cctype>
55using namespace llvm;
56
57#define DEBUG_TYPE "x86-isel"
58
59STATISTIC(NumTailCalls, "Number of tail calls");
60
61static cl::opt<bool> ExperimentalVectorWideningLegalization(
62    "x86-experimental-vector-widening-legalization", cl::init(false),
63    cl::desc("Enable an experimental vector type legalization through widening "
64             "rather than promotion."),
65    cl::Hidden);
66
67static cl::opt<bool> ExperimentalVectorShuffleLowering(
68    "x86-experimental-vector-shuffle-lowering", cl::init(false),
69    cl::desc("Enable an experimental vector shuffle lowering code path."),
70    cl::Hidden);
71
72// Forward declarations.
73static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
74                       SDValue V2);
75
76static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
77                                SelectionDAG &DAG, SDLoc dl,
78                                unsigned vectorWidth) {
79  assert((vectorWidth == 128 || vectorWidth == 256) &&
80         "Unsupported vector width");
81  EVT VT = Vec.getValueType();
82  EVT ElVT = VT.getVectorElementType();
83  unsigned Factor = VT.getSizeInBits()/vectorWidth;
84  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
85                                  VT.getVectorNumElements()/Factor);
86
87  // Extract from UNDEF is UNDEF.
88  if (Vec.getOpcode() == ISD::UNDEF)
89    return DAG.getUNDEF(ResultVT);
90
91  // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
92  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
93
94  // This is the index of the first element of the vectorWidth-bit chunk
95  // we want.
96  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
97                               * ElemsPerChunk);
98
99  // If the input is a buildvector just emit a smaller one.
100  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
101    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
102                       makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
103                                    ElemsPerChunk));
104
105  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
106  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
107                               VecIdx);
108
109  return Result;
110
111}
112/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
113/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
114/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
115/// instructions or a simple subregister reference. Idx is an index in the
116/// 128 bits we want.  It need not be aligned to a 128-bit bounday.  That makes
117/// lowering EXTRACT_VECTOR_ELT operations easier.
118static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
119                                   SelectionDAG &DAG, SDLoc dl) {
120  assert((Vec.getValueType().is256BitVector() ||
121          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
122  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
123}
124
125/// Generate a DAG to grab 256-bits from a 512-bit vector.
126static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
127                                   SelectionDAG &DAG, SDLoc dl) {
128  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
129  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
130}
131
132static SDValue InsertSubVector(SDValue Result, SDValue Vec,
133                               unsigned IdxVal, SelectionDAG &DAG,
134                               SDLoc dl, unsigned vectorWidth) {
135  assert((vectorWidth == 128 || vectorWidth == 256) &&
136         "Unsupported vector width");
137  // Inserting UNDEF is Result
138  if (Vec.getOpcode() == ISD::UNDEF)
139    return Result;
140  EVT VT = Vec.getValueType();
141  EVT ElVT = VT.getVectorElementType();
142  EVT ResultVT = Result.getValueType();
143
144  // Insert the relevant vectorWidth bits.
145  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
146
147  // This is the index of the first element of the vectorWidth-bit chunk
148  // we want.
149  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
150                               * ElemsPerChunk);
151
152  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
153  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
154                     VecIdx);
155}
156/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
157/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
158/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
159/// simple superregister reference.  Idx is an index in the 128 bits
160/// we want.  It need not be aligned to a 128-bit bounday.  That makes
161/// lowering INSERT_VECTOR_ELT operations easier.
162static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
163                                  unsigned IdxVal, SelectionDAG &DAG,
164                                  SDLoc dl) {
165  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
166  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
167}
168
169static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
170                                  unsigned IdxVal, SelectionDAG &DAG,
171                                  SDLoc dl) {
172  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
173  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
174}
175
176/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
177/// instructions. This is used because creating CONCAT_VECTOR nodes of
178/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
179/// large BUILD_VECTORS.
180static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
181                                   unsigned NumElems, SelectionDAG &DAG,
182                                   SDLoc dl) {
183  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
184  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
185}
186
187static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
188                                   unsigned NumElems, SelectionDAG &DAG,
189                                   SDLoc dl) {
190  SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
191  return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
192}
193
194static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
195  if (TT.isOSBinFormatMachO()) {
196    if (TT.getArch() == Triple::x86_64)
197      return new X86_64MachoTargetObjectFile();
198    return new TargetLoweringObjectFileMachO();
199  }
200
201  if (TT.isOSLinux())
202    return new X86LinuxTargetObjectFile();
203  if (TT.isOSBinFormatELF())
204    return new TargetLoweringObjectFileELF();
205  if (TT.isKnownWindowsMSVCEnvironment())
206    return new X86WindowsTargetObjectFile();
207  if (TT.isOSBinFormatCOFF())
208    return new TargetLoweringObjectFileCOFF();
209  llvm_unreachable("unknown subtarget type");
210}
211
212// FIXME: This should stop caching the target machine as soon as
213// we can remove resetOperationActions et al.
214X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
215  : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
216  Subtarget = &TM.getSubtarget<X86Subtarget>();
217  X86ScalarSSEf64 = Subtarget->hasSSE2();
218  X86ScalarSSEf32 = Subtarget->hasSSE1();
219  TD = getDataLayout();
220
221  resetOperationActions();
222}
223
224void X86TargetLowering::resetOperationActions() {
225  const TargetMachine &TM = getTargetMachine();
226  static bool FirstTimeThrough = true;
227
228  // If none of the target options have changed, then we don't need to reset the
229  // operation actions.
230  if (!FirstTimeThrough && TO == TM.Options) return;
231
232  if (!FirstTimeThrough) {
233    // Reinitialize the actions.
234    initActions();
235    FirstTimeThrough = false;
236  }
237
238  TO = TM.Options;
239
240  // Set up the TargetLowering object.
241  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
242
243  // X86 is weird, it always uses i8 for shift amounts and setcc results.
244  setBooleanContents(ZeroOrOneBooleanContent);
245  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
246  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
247
248  // For 64-bit since we have so many registers use the ILP scheduler, for
249  // 32-bit code use the register pressure specific scheduling.
250  // For Atom, always use ILP scheduling.
251  if (Subtarget->isAtom())
252    setSchedulingPreference(Sched::ILP);
253  else if (Subtarget->is64Bit())
254    setSchedulingPreference(Sched::ILP);
255  else
256    setSchedulingPreference(Sched::RegPressure);
257  const X86RegisterInfo *RegInfo =
258    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
259  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
260
261  // Bypass expensive divides on Atom when compiling with O2
262  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
263    addBypassSlowDiv(32, 8);
264    if (Subtarget->is64Bit())
265      addBypassSlowDiv(64, 16);
266  }
267
268  if (Subtarget->isTargetKnownWindowsMSVC()) {
269    // Setup Windows compiler runtime calls.
270    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
271    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
272    setLibcallName(RTLIB::SREM_I64, "_allrem");
273    setLibcallName(RTLIB::UREM_I64, "_aullrem");
274    setLibcallName(RTLIB::MUL_I64, "_allmul");
275    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
276    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
277    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
278    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
279    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
280
281    // The _ftol2 runtime function has an unusual calling conv, which
282    // is modeled by a special pseudo-instruction.
283    setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
284    setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
285    setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
286    setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
287  }
288
289  if (Subtarget->isTargetDarwin()) {
290    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
291    setUseUnderscoreSetJmp(false);
292    setUseUnderscoreLongJmp(false);
293  } else if (Subtarget->isTargetWindowsGNU()) {
294    // MS runtime is weird: it exports _setjmp, but longjmp!
295    setUseUnderscoreSetJmp(true);
296    setUseUnderscoreLongJmp(false);
297  } else {
298    setUseUnderscoreSetJmp(true);
299    setUseUnderscoreLongJmp(true);
300  }
301
302  // Set up the register classes.
303  addRegisterClass(MVT::i8, &X86::GR8RegClass);
304  addRegisterClass(MVT::i16, &X86::GR16RegClass);
305  addRegisterClass(MVT::i32, &X86::GR32RegClass);
306  if (Subtarget->is64Bit())
307    addRegisterClass(MVT::i64, &X86::GR64RegClass);
308
309  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
310
311  // We don't accept any truncstore of integer registers.
312  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
313  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
314  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
315  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
316  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
317  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
318
319  // SETOEQ and SETUNE require checking two conditions.
320  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
321  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
322  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
323  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
324  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
325  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
326
327  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
328  // operation.
329  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
330  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
331  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
332
333  if (Subtarget->is64Bit()) {
334    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
335    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
336  } else if (!TM.Options.UseSoftFloat) {
337    // We have an algorithm for SSE2->double, and we turn this into a
338    // 64-bit FILD followed by conditional FADD for other targets.
339    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
340    // We have an algorithm for SSE2, and we turn this into a 64-bit
341    // FILD for other targets.
342    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
343  }
344
345  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
346  // this operation.
347  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
348  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
349
350  if (!TM.Options.UseSoftFloat) {
351    // SSE has no i16 to fp conversion, only i32
352    if (X86ScalarSSEf32) {
353      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
354      // f32 and f64 cases are Legal, f80 case is not
355      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
356    } else {
357      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
358      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
359    }
360  } else {
361    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
362    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
363  }
364
365  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
366  // are Legal, f80 is custom lowered.
367  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
368  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
369
370  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
371  // this operation.
372  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
373  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
374
375  if (X86ScalarSSEf32) {
376    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
377    // f32 and f64 cases are Legal, f80 case is not
378    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
379  } else {
380    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
381    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
382  }
383
384  // Handle FP_TO_UINT by promoting the destination to a larger signed
385  // conversion.
386  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
387  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
388  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
389
390  if (Subtarget->is64Bit()) {
391    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
392    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
393  } else if (!TM.Options.UseSoftFloat) {
394    // Since AVX is a superset of SSE3, only check for SSE here.
395    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
396      // Expand FP_TO_UINT into a select.
397      // FIXME: We would like to use a Custom expander here eventually to do
398      // the optimal thing for SSE vs. the default expansion in the legalizer.
399      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
400    else
401      // With SSE3 we can use fisttpll to convert to a signed i64; without
402      // SSE, we're stuck with a fistpll.
403      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
404  }
405
406  if (isTargetFTOL()) {
407    // Use the _ftol2 runtime function, which has a pseudo-instruction
408    // to handle its weird calling convention.
409    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
410  }
411
412  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
413  if (!X86ScalarSSEf64) {
414    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
415    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
416    if (Subtarget->is64Bit()) {
417      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
418      // Without SSE, i64->f64 goes through memory.
419      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
420    }
421  }
422
423  // Scalar integer divide and remainder are lowered to use operations that
424  // produce two results, to match the available instructions. This exposes
425  // the two-result form to trivial CSE, which is able to combine x/y and x%y
426  // into a single instruction.
427  //
428  // Scalar integer multiply-high is also lowered to use two-result
429  // operations, to match the available instructions. However, plain multiply
430  // (low) operations are left as Legal, as there are single-result
431  // instructions for this in x86. Using the two-result multiply instructions
432  // when both high and low results are needed must be arranged by dagcombine.
433  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
434    MVT VT = IntVTs[i];
435    setOperationAction(ISD::MULHS, VT, Expand);
436    setOperationAction(ISD::MULHU, VT, Expand);
437    setOperationAction(ISD::SDIV, VT, Expand);
438    setOperationAction(ISD::UDIV, VT, Expand);
439    setOperationAction(ISD::SREM, VT, Expand);
440    setOperationAction(ISD::UREM, VT, Expand);
441
442    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
443    setOperationAction(ISD::ADDC, VT, Custom);
444    setOperationAction(ISD::ADDE, VT, Custom);
445    setOperationAction(ISD::SUBC, VT, Custom);
446    setOperationAction(ISD::SUBE, VT, Custom);
447  }
448
449  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
450  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
451  setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
452  setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
453  setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
454  setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
455  setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
456  setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
457  setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
458  setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
459  setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
460  setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
461  setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
462  setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
463  setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
464  setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
465  if (Subtarget->is64Bit())
466    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
467  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
468  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
469  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
470  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
471  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
472  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
473  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
474  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
475
476  // Promote the i8 variants and force them on up to i32 which has a shorter
477  // encoding.
478  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
479  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
480  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
481  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
482  if (Subtarget->hasBMI()) {
483    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
484    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
485    if (Subtarget->is64Bit())
486      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
487  } else {
488    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
489    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
490    if (Subtarget->is64Bit())
491      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
492  }
493
494  if (Subtarget->hasLZCNT()) {
495    // When promoting the i8 variants, force them to i32 for a shorter
496    // encoding.
497    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
498    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
499    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
500    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
501    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
502    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
503    if (Subtarget->is64Bit())
504      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
505  } else {
506    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
507    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
508    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
509    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
510    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
511    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
512    if (Subtarget->is64Bit()) {
513      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
514      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
515    }
516  }
517
518  // Special handling for half-precision floating point conversions.
519  // If we don't have F16C support, then lower half float conversions
520  // into library calls.
521  if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
522    setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
523    setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand);
524  }
525
526  if (Subtarget->hasPOPCNT()) {
527    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
528  } else {
529    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
530    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
531    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
532    if (Subtarget->is64Bit())
533      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
534  }
535
536  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
537
538  if (!Subtarget->hasMOVBE())
539    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
540
541  // These should be promoted to a larger select which is supported.
542  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
543  // X86 wants to expand cmov itself.
544  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
545  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
546  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
547  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
548  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
549  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
550  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
551  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
552  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
553  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
554  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
555  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
556  if (Subtarget->is64Bit()) {
557    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
558    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
559  }
560  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
561  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
562  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
563  // support continuation, user-level threading, and etc.. As a result, no
564  // other SjLj exception interfaces are implemented and please don't build
565  // your own exception handling based on them.
566  // LLVM/Clang supports zero-cost DWARF exception handling.
567  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
568  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
569
570  // Darwin ABI issue.
571  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
572  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
573  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
574  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
575  if (Subtarget->is64Bit())
576    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
577  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
578  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
579  if (Subtarget->is64Bit()) {
580    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
581    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
582    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
583    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
584    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
585  }
586  // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
587  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
588  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
589  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
590  if (Subtarget->is64Bit()) {
591    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
592    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
593    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
594  }
595
596  if (Subtarget->hasSSE1())
597    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
598
599  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
600
601  // Expand certain atomics
602  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
603    MVT VT = IntVTs[i];
604    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
605    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
606    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
607  }
608
609  if (Subtarget->hasCmpxchg16b()) {
610    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
611  }
612
613  // FIXME - use subtarget debug flags
614  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
615      !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
616    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
617  }
618
619  if (Subtarget->is64Bit()) {
620    setExceptionPointerRegister(X86::RAX);
621    setExceptionSelectorRegister(X86::RDX);
622  } else {
623    setExceptionPointerRegister(X86::EAX);
624    setExceptionSelectorRegister(X86::EDX);
625  }
626  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
627  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
628
629  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
630  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
631
632  setOperationAction(ISD::TRAP, MVT::Other, Legal);
633  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
634
635  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
636  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
637  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
638  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
639    // TargetInfo::X86_64ABIBuiltinVaList
640    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
641    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
642  } else {
643    // TargetInfo::CharPtrBuiltinVaList
644    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
645    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
646  }
647
648  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
649  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
650
651  setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
652                     MVT::i64 : MVT::i32, Custom);
653
654  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
655    // f32 and f64 use SSE.
656    // Set up the FP register classes.
657    addRegisterClass(MVT::f32, &X86::FR32RegClass);
658    addRegisterClass(MVT::f64, &X86::FR64RegClass);
659
660    // Use ANDPD to simulate FABS.
661    setOperationAction(ISD::FABS , MVT::f64, Custom);
662    setOperationAction(ISD::FABS , MVT::f32, Custom);
663
664    // Use XORP to simulate FNEG.
665    setOperationAction(ISD::FNEG , MVT::f64, Custom);
666    setOperationAction(ISD::FNEG , MVT::f32, Custom);
667
668    // Use ANDPD and ORPD to simulate FCOPYSIGN.
669    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
670    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
671
672    // Lower this to FGETSIGNx86 plus an AND.
673    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
674    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
675
676    // We don't support sin/cos/fmod
677    setOperationAction(ISD::FSIN   , MVT::f64, Expand);
678    setOperationAction(ISD::FCOS   , MVT::f64, Expand);
679    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
680    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
681    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
682    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
683
684    // Expand FP immediates into loads from the stack, except for the special
685    // cases we handle.
686    addLegalFPImmediate(APFloat(+0.0)); // xorpd
687    addLegalFPImmediate(APFloat(+0.0f)); // xorps
688  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
689    // Use SSE for f32, x87 for f64.
690    // Set up the FP register classes.
691    addRegisterClass(MVT::f32, &X86::FR32RegClass);
692    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
693
694    // Use ANDPS to simulate FABS.
695    setOperationAction(ISD::FABS , MVT::f32, Custom);
696
697    // Use XORP to simulate FNEG.
698    setOperationAction(ISD::FNEG , MVT::f32, Custom);
699
700    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
701
702    // Use ANDPS and ORPS to simulate FCOPYSIGN.
703    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
704    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
705
706    // We don't support sin/cos/fmod
707    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
708    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
709    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
710
711    // Special cases we handle for FP constants.
712    addLegalFPImmediate(APFloat(+0.0f)); // xorps
713    addLegalFPImmediate(APFloat(+0.0)); // FLD0
714    addLegalFPImmediate(APFloat(+1.0)); // FLD1
715    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
716    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
717
718    if (!TM.Options.UnsafeFPMath) {
719      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
720      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
721      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
722    }
723  } else if (!TM.Options.UseSoftFloat) {
724    // f32 and f64 in x87.
725    // Set up the FP register classes.
726    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
727    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
728
729    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
730    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
731    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
732    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
733
734    if (!TM.Options.UnsafeFPMath) {
735      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
736      setOperationAction(ISD::FSIN   , MVT::f32, Expand);
737      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
738      setOperationAction(ISD::FCOS   , MVT::f32, Expand);
739      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
740      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
741    }
742    addLegalFPImmediate(APFloat(+0.0)); // FLD0
743    addLegalFPImmediate(APFloat(+1.0)); // FLD1
744    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
745    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
746    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
747    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
748    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
749    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
750  }
751
752  // We don't support FMA.
753  setOperationAction(ISD::FMA, MVT::f64, Expand);
754  setOperationAction(ISD::FMA, MVT::f32, Expand);
755
756  // Long double always uses X87.
757  if (!TM.Options.UseSoftFloat) {
758    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
759    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
760    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
761    {
762      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
763      addLegalFPImmediate(TmpFlt);  // FLD0
764      TmpFlt.changeSign();
765      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
766
767      bool ignored;
768      APFloat TmpFlt2(+1.0);
769      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
770                      &ignored);
771      addLegalFPImmediate(TmpFlt2);  // FLD1
772      TmpFlt2.changeSign();
773      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
774    }
775
776    if (!TM.Options.UnsafeFPMath) {
777      setOperationAction(ISD::FSIN   , MVT::f80, Expand);
778      setOperationAction(ISD::FCOS   , MVT::f80, Expand);
779      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
780    }
781
782    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
783    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
784    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
785    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
786    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
787    setOperationAction(ISD::FMA, MVT::f80, Expand);
788  }
789
790  // Always use a library call for pow.
791  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
792  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
793  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
794
795  setOperationAction(ISD::FLOG, MVT::f80, Expand);
796  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
797  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
798  setOperationAction(ISD::FEXP, MVT::f80, Expand);
799  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
800
801  // First set operation action for all vector types to either promote
802  // (for widening) or expand (for scalarization). Then we will selectively
803  // turn on ones that can be effectively codegen'd.
804  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
805           i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
806    MVT VT = (MVT::SimpleValueType)i;
807    setOperationAction(ISD::ADD , VT, Expand);
808    setOperationAction(ISD::SUB , VT, Expand);
809    setOperationAction(ISD::FADD, VT, Expand);
810    setOperationAction(ISD::FNEG, VT, Expand);
811    setOperationAction(ISD::FSUB, VT, Expand);
812    setOperationAction(ISD::MUL , VT, Expand);
813    setOperationAction(ISD::FMUL, VT, Expand);
814    setOperationAction(ISD::SDIV, VT, Expand);
815    setOperationAction(ISD::UDIV, VT, Expand);
816    setOperationAction(ISD::FDIV, VT, Expand);
817    setOperationAction(ISD::SREM, VT, Expand);
818    setOperationAction(ISD::UREM, VT, Expand);
819    setOperationAction(ISD::LOAD, VT, Expand);
820    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
821    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
822    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
823    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
824    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
825    setOperationAction(ISD::FABS, VT, Expand);
826    setOperationAction(ISD::FSIN, VT, Expand);
827    setOperationAction(ISD::FSINCOS, VT, Expand);
828    setOperationAction(ISD::FCOS, VT, Expand);
829    setOperationAction(ISD::FSINCOS, VT, Expand);
830    setOperationAction(ISD::FREM, VT, Expand);
831    setOperationAction(ISD::FMA,  VT, Expand);
832    setOperationAction(ISD::FPOWI, VT, Expand);
833    setOperationAction(ISD::FSQRT, VT, Expand);
834    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
835    setOperationAction(ISD::FFLOOR, VT, Expand);
836    setOperationAction(ISD::FCEIL, VT, Expand);
837    setOperationAction(ISD::FTRUNC, VT, Expand);
838    setOperationAction(ISD::FRINT, VT, Expand);
839    setOperationAction(ISD::FNEARBYINT, VT, Expand);
840    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
841    setOperationAction(ISD::MULHS, VT, Expand);
842    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
843    setOperationAction(ISD::MULHU, VT, Expand);
844    setOperationAction(ISD::SDIVREM, VT, Expand);
845    setOperationAction(ISD::UDIVREM, VT, Expand);
846    setOperationAction(ISD::FPOW, VT, Expand);
847    setOperationAction(ISD::CTPOP, VT, Expand);
848    setOperationAction(ISD::CTTZ, VT, Expand);
849    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
850    setOperationAction(ISD::CTLZ, VT, Expand);
851    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
852    setOperationAction(ISD::SHL, VT, Expand);
853    setOperationAction(ISD::SRA, VT, Expand);
854    setOperationAction(ISD::SRL, VT, Expand);
855    setOperationAction(ISD::ROTL, VT, Expand);
856    setOperationAction(ISD::ROTR, VT, Expand);
857    setOperationAction(ISD::BSWAP, VT, Expand);
858    setOperationAction(ISD::SETCC, VT, Expand);
859    setOperationAction(ISD::FLOG, VT, Expand);
860    setOperationAction(ISD::FLOG2, VT, Expand);
861    setOperationAction(ISD::FLOG10, VT, Expand);
862    setOperationAction(ISD::FEXP, VT, Expand);
863    setOperationAction(ISD::FEXP2, VT, Expand);
864    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
865    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
866    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
867    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
868    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
869    setOperationAction(ISD::TRUNCATE, VT, Expand);
870    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
871    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
872    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
873    setOperationAction(ISD::VSELECT, VT, Expand);
874    setOperationAction(ISD::SELECT_CC, VT, Expand);
875    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
876             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
877      setTruncStoreAction(VT,
878                          (MVT::SimpleValueType)InnerVT, Expand);
879    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
880    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
881    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
882  }
883
884  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
885  // with -msoft-float, disable use of MMX as well.
886  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
887    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
888    // No operations on x86mmx supported, everything uses intrinsics.
889  }
890
891  // MMX-sized vectors (other than x86mmx) are expected to be expanded
892  // into smaller operations.
893  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
894  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
895  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
896  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
897  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
898  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
899  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
900  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
901  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
902  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
903  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
904  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
905  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
906  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
907  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
908  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
909  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
910  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
911  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
912  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
913  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
914  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
915  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
916  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
917  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
918  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
919  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
920  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
921  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
922
923  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
924    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
925
926    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
927    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
928    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
929    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
930    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
931    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
932    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
933    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
934    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
935    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
936    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
937    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
938  }
939
940  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
941    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
942
943    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
944    // registers cannot be used even for integer operations.
945    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
946    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
947    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
948    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
949
950    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
951    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
952    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
953    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
954    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
955    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
956    setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
957    setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
958    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
959    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
960    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
961    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
962    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
963    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
964    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
965    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
966    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
967    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
968    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
969    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
970    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
971    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
972
973    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
974    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
975    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
976    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
977
978    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
979    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
980    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
981    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
982    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
983
984    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
985    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
986      MVT VT = (MVT::SimpleValueType)i;
987      // Do not attempt to custom lower non-power-of-2 vectors
988      if (!isPowerOf2_32(VT.getVectorNumElements()))
989        continue;
990      // Do not attempt to custom lower non-128-bit vectors
991      if (!VT.is128BitVector())
992        continue;
993      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
994      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
995      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
996    }
997
998    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
999    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
1000    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
1001    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
1002    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
1003    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1004
1005    if (Subtarget->is64Bit()) {
1006      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1007      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1008    }
1009
1010    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1011    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1012      MVT VT = (MVT::SimpleValueType)i;
1013
1014      // Do not attempt to promote non-128-bit vectors
1015      if (!VT.is128BitVector())
1016        continue;
1017
1018      setOperationAction(ISD::AND,    VT, Promote);
1019      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1020      setOperationAction(ISD::OR,     VT, Promote);
1021      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1022      setOperationAction(ISD::XOR,    VT, Promote);
1023      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1024      setOperationAction(ISD::LOAD,   VT, Promote);
1025      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1026      setOperationAction(ISD::SELECT, VT, Promote);
1027      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1028    }
1029
1030    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
1031
1032    // Custom lower v2i64 and v2f64 selects.
1033    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1034    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1035    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1036    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1037
1038    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1039    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1040
1041    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1042    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1043    // As there is no 64-bit GPR available, we need build a special custom
1044    // sequence to convert from v2i32 to v2f32.
1045    if (!Subtarget->is64Bit())
1046      setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1047
1048    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1049    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1050
1051    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
1052
1053    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1054    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1055    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1056  }
1057
1058  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1059    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1060    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1061    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1062    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1063    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1064    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1065    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1066    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1067    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1068    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1069
1070    setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1071    setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1072    setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1073    setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1074    setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1075    setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1076    setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1077    setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1078    setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1079    setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1080
1081    // FIXME: Do we need to handle scalar-to-vector here?
1082    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1083
1084    setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
1085    setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
1086    setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
1087    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1088    setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
1089    // There is no BLENDI for byte vectors. We don't need to custom lower
1090    // some vselects for now.
1091    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1092
1093    // i8 and i16 vectors are custom , because the source register and source
1094    // source memory operand types are not the same width.  f32 vectors are
1095    // custom since the immediate controlling the insert encodes additional
1096    // information.
1097    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1098    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1099    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1100    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1101
1102    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1103    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1104    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1105    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1106
1107    // FIXME: these should be Legal but thats only for the case where
1108    // the index is constant.  For now custom expand to deal with that.
1109    if (Subtarget->is64Bit()) {
1110      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1111      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1112    }
1113  }
1114
1115  if (Subtarget->hasSSE2()) {
1116    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1117    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1118
1119    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1120    setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1121
1122    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1123    setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1124
1125    // In the customized shift lowering, the legal cases in AVX2 will be
1126    // recognized.
1127    setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1128    setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1129
1130    setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1131    setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1132
1133    setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1134  }
1135
1136  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1137    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1138    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1139    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1140    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1141    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1142    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1143
1144    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1145    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1146    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1147
1148    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1149    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1150    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1151    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1152    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1153    setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1154    setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1155    setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1156    setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1157    setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1158    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1159    setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1160
1161    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1162    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1163    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1164    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1165    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1166    setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1167    setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1168    setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1169    setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1170    setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1171    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1172    setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1173
1174    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1175    // even though v8i16 is a legal type.
1176    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1177    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1178    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1179
1180    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1181    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1182    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1183
1184    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1185    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1186
1187    setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
1188
1189    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1190    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1191
1192    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1193    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1194
1195    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1196    setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1197
1198    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1199    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1200    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1201    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1202
1203    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1204    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1205    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1206
1207    setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
1208    setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
1209    setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
1210    setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
1211
1212    setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1213    setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1214    setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1215    setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1216    setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1217    setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1218    setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1219    setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1220    setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1221    setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1222    setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1223    setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1224
1225    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1226      setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1227      setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1228      setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1229      setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1230      setOperationAction(ISD::FMA,             MVT::f32, Legal);
1231      setOperationAction(ISD::FMA,             MVT::f64, Legal);
1232    }
1233
1234    if (Subtarget->hasInt256()) {
1235      setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1236      setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1237      setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1238      setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1239
1240      setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1241      setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1242      setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1243      setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1244
1245      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1246      setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1247      setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1248      // Don't lower v32i8 because there is no 128-bit byte mul
1249
1250      setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1251      setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1252      setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1253      setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1254
1255      setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
1256      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1257    } else {
1258      setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1259      setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1260      setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1261      setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1262
1263      setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1264      setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1265      setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1266      setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1267
1268      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1269      setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1270      setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1271      // Don't lower v32i8 because there is no 128-bit byte mul
1272    }
1273
1274    // In the customized shift lowering, the legal cases in AVX2 will be
1275    // recognized.
1276    setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1277    setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1278
1279    setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1280    setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1281
1282    setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1283
1284    // Custom lower several nodes for 256-bit types.
1285    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
1286             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
1287      MVT VT = (MVT::SimpleValueType)i;
1288
1289      // Extract subvector is special because the value type
1290      // (result) is 128-bit but the source is 256-bit wide.
1291      if (VT.is128BitVector())
1292        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1293
1294      // Do not attempt to custom lower other non-256-bit vectors
1295      if (!VT.is256BitVector())
1296        continue;
1297
1298      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1299      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1300      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1301      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1302      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1303      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1304      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1305    }
1306
1307    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1308    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1309      MVT VT = (MVT::SimpleValueType)i;
1310
1311      // Do not attempt to promote non-256-bit vectors
1312      if (!VT.is256BitVector())
1313        continue;
1314
1315      setOperationAction(ISD::AND,    VT, Promote);
1316      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1317      setOperationAction(ISD::OR,     VT, Promote);
1318      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1319      setOperationAction(ISD::XOR,    VT, Promote);
1320      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1321      setOperationAction(ISD::LOAD,   VT, Promote);
1322      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1323      setOperationAction(ISD::SELECT, VT, Promote);
1324      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1325    }
1326  }
1327
1328  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1329    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1330    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1331    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1332    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1333
1334    addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1335    addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1336    addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1337
1338    setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1339    setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1340    setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1341    setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1342    setOperationAction(ISD::AND,                MVT::i1,    Legal);
1343    setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
1344    setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1345    setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1346    setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1347    setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1348    setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1349
1350    setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1351    setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1352    setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1353    setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1354    setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1355    setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1356
1357    setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1358    setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1359    setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1360    setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1361    setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1362    setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1363    setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1364    setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1365
1366    setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1367    setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1368    setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1369    setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1370    if (Subtarget->is64Bit()) {
1371      setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1372      setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1373      setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1374      setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1375    }
1376    setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1377    setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1378    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1379    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1380    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1381    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1382    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1383    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1384    setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1385    setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1386
1387    setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1388    setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1389    setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1390    setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1391    setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1392    setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1393    setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1394    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1395    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1396    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1397    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1398    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1399    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1400
1401    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1402    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1403    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1404    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1405    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1406    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1407
1408    setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1409    setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1410
1411    setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1412
1413    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1414    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1415    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1416    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1417    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1418    setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1419    setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1420    setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1421    setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1422
1423    setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1424    setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1425
1426    setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1427    setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1428
1429    setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1430
1431    setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1432    setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1433
1434    setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1435    setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1436
1437    setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1438    setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1439
1440    setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1441    setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1442    setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1443    setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1444    setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1445    setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1446
1447    if (Subtarget->hasCDI()) {
1448      setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1449      setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1450    }
1451
1452    // Custom lower several nodes.
1453    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
1454             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
1455      MVT VT = (MVT::SimpleValueType)i;
1456
1457      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1458      // Extract subvector is special because the value type
1459      // (result) is 256/128-bit but the source is 512-bit wide.
1460      if (VT.is128BitVector() || VT.is256BitVector())
1461        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1462
1463      if (VT.getVectorElementType() == MVT::i1)
1464        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1465
1466      // Do not attempt to custom lower other non-512-bit vectors
1467      if (!VT.is512BitVector())
1468        continue;
1469
1470      if ( EltSize >= 32) {
1471        setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1472        setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1473        setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1474        setOperationAction(ISD::VSELECT,             VT, Legal);
1475        setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1476        setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1477        setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1478      }
1479    }
1480    for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1481      MVT VT = (MVT::SimpleValueType)i;
1482
1483      // Do not attempt to promote non-256-bit vectors
1484      if (!VT.is512BitVector())
1485        continue;
1486
1487      setOperationAction(ISD::SELECT, VT, Promote);
1488      AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1489    }
1490  }// has  AVX-512
1491
1492  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1493  // of this type with custom code.
1494  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
1495           VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
1496    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
1497                       Custom);
1498  }
1499
1500  // We want to custom lower some of our intrinsics.
1501  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1502  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1503  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1504  if (!Subtarget->is64Bit())
1505    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1506
1507  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1508  // handle type legalization for these operations here.
1509  //
1510  // FIXME: We really should do custom legalization for addition and
1511  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1512  // than generic legalization for 64-bit multiplication-with-overflow, though.
1513  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1514    // Add/Sub/Mul with overflow operations are custom lowered.
1515    MVT VT = IntVTs[i];
1516    setOperationAction(ISD::SADDO, VT, Custom);
1517    setOperationAction(ISD::UADDO, VT, Custom);
1518    setOperationAction(ISD::SSUBO, VT, Custom);
1519    setOperationAction(ISD::USUBO, VT, Custom);
1520    setOperationAction(ISD::SMULO, VT, Custom);
1521    setOperationAction(ISD::UMULO, VT, Custom);
1522  }
1523
1524  // There are no 8-bit 3-address imul/mul instructions
1525  setOperationAction(ISD::SMULO, MVT::i8, Expand);
1526  setOperationAction(ISD::UMULO, MVT::i8, Expand);
1527
1528  if (!Subtarget->is64Bit()) {
1529    // These libcalls are not available in 32-bit.
1530    setLibcallName(RTLIB::SHL_I128, nullptr);
1531    setLibcallName(RTLIB::SRL_I128, nullptr);
1532    setLibcallName(RTLIB::SRA_I128, nullptr);
1533  }
1534
1535  // Combine sin / cos into one node or libcall if possible.
1536  if (Subtarget->hasSinCos()) {
1537    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1538    setLibcallName(RTLIB::SINCOS_F64, "sincos");
1539    if (Subtarget->isTargetDarwin()) {
1540      // For MacOSX, we don't want to the normal expansion of a libcall to
1541      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
1542      // traffic.
1543      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1544      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1545    }
1546  }
1547
1548  if (Subtarget->isTargetWin64()) {
1549    setOperationAction(ISD::SDIV, MVT::i128, Custom);
1550    setOperationAction(ISD::UDIV, MVT::i128, Custom);
1551    setOperationAction(ISD::SREM, MVT::i128, Custom);
1552    setOperationAction(ISD::UREM, MVT::i128, Custom);
1553    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1554    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1555  }
1556
1557  // We have target-specific dag combine patterns for the following nodes:
1558  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1559  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1560  setTargetDAGCombine(ISD::VSELECT);
1561  setTargetDAGCombine(ISD::SELECT);
1562  setTargetDAGCombine(ISD::SHL);
1563  setTargetDAGCombine(ISD::SRA);
1564  setTargetDAGCombine(ISD::SRL);
1565  setTargetDAGCombine(ISD::OR);
1566  setTargetDAGCombine(ISD::AND);
1567  setTargetDAGCombine(ISD::ADD);
1568  setTargetDAGCombine(ISD::FADD);
1569  setTargetDAGCombine(ISD::FSUB);
1570  setTargetDAGCombine(ISD::FMA);
1571  setTargetDAGCombine(ISD::SUB);
1572  setTargetDAGCombine(ISD::LOAD);
1573  setTargetDAGCombine(ISD::STORE);
1574  setTargetDAGCombine(ISD::ZERO_EXTEND);
1575  setTargetDAGCombine(ISD::ANY_EXTEND);
1576  setTargetDAGCombine(ISD::SIGN_EXTEND);
1577  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1578  setTargetDAGCombine(ISD::TRUNCATE);
1579  setTargetDAGCombine(ISD::SINT_TO_FP);
1580  setTargetDAGCombine(ISD::SETCC);
1581  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1582  setTargetDAGCombine(ISD::BUILD_VECTOR);
1583  if (Subtarget->is64Bit())
1584    setTargetDAGCombine(ISD::MUL);
1585  setTargetDAGCombine(ISD::XOR);
1586
1587  computeRegisterProperties();
1588
1589  // On Darwin, -Os means optimize for size without hurting performance,
1590  // do not reduce the limit.
1591  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1592  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1593  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1594  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1595  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1596  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1597  setPrefLoopAlignment(4); // 2^4 bytes.
1598
1599  // Predictable cmov don't hurt on atom because it's in-order.
1600  PredictableSelectIsExpensive = !Subtarget->isAtom();
1601
1602  setPrefFunctionAlignment(4); // 2^4 bytes.
1603}
1604
1605TargetLoweringBase::LegalizeTypeAction
1606X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1607  if (ExperimentalVectorWideningLegalization &&
1608      VT.getVectorNumElements() != 1 &&
1609      VT.getVectorElementType().getSimpleVT() != MVT::i1)
1610    return TypeWidenVector;
1611
1612  return TargetLoweringBase::getPreferredVectorAction(VT);
1613}
1614
1615EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1616  if (!VT.isVector())
1617    return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1618
1619  if (Subtarget->hasAVX512())
1620    switch(VT.getVectorNumElements()) {
1621    case  8: return MVT::v8i1;
1622    case 16: return MVT::v16i1;
1623  }
1624
1625  return VT.changeVectorElementTypeToInteger();
1626}
1627
1628/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1629/// the desired ByVal argument alignment.
1630static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1631  if (MaxAlign == 16)
1632    return;
1633  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1634    if (VTy->getBitWidth() == 128)
1635      MaxAlign = 16;
1636  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1637    unsigned EltAlign = 0;
1638    getMaxByValAlign(ATy->getElementType(), EltAlign);
1639    if (EltAlign > MaxAlign)
1640      MaxAlign = EltAlign;
1641  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1642    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1643      unsigned EltAlign = 0;
1644      getMaxByValAlign(STy->getElementType(i), EltAlign);
1645      if (EltAlign > MaxAlign)
1646        MaxAlign = EltAlign;
1647      if (MaxAlign == 16)
1648        break;
1649    }
1650  }
1651}
1652
1653/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1654/// function arguments in the caller parameter area. For X86, aggregates
1655/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1656/// are at 4-byte boundaries.
1657unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1658  if (Subtarget->is64Bit()) {
1659    // Max of 8 and alignment of type.
1660    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1661    if (TyAlign > 8)
1662      return TyAlign;
1663    return 8;
1664  }
1665
1666  unsigned Align = 4;
1667  if (Subtarget->hasSSE1())
1668    getMaxByValAlign(Ty, Align);
1669  return Align;
1670}
1671
1672/// getOptimalMemOpType - Returns the target specific optimal type for load
1673/// and store operations as a result of memset, memcpy, and memmove
1674/// lowering. If DstAlign is zero that means it's safe to destination
1675/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1676/// means there isn't a need to check it against alignment requirement,
1677/// probably because the source does not need to be loaded. If 'IsMemset' is
1678/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1679/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1680/// source is constant so it does not need to be loaded.
1681/// It returns EVT::Other if the type should be determined using generic
1682/// target-independent logic.
1683EVT
1684X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1685                                       unsigned DstAlign, unsigned SrcAlign,
1686                                       bool IsMemset, bool ZeroMemset,
1687                                       bool MemcpyStrSrc,
1688                                       MachineFunction &MF) const {
1689  const Function *F = MF.getFunction();
1690  if ((!IsMemset || ZeroMemset) &&
1691      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1692                                       Attribute::NoImplicitFloat)) {
1693    if (Size >= 16 &&
1694        (Subtarget->isUnalignedMemAccessFast() ||
1695         ((DstAlign == 0 || DstAlign >= 16) &&
1696          (SrcAlign == 0 || SrcAlign >= 16)))) {
1697      if (Size >= 32) {
1698        if (Subtarget->hasInt256())
1699          return MVT::v8i32;
1700        if (Subtarget->hasFp256())
1701          return MVT::v8f32;
1702      }
1703      if (Subtarget->hasSSE2())
1704        return MVT::v4i32;
1705      if (Subtarget->hasSSE1())
1706        return MVT::v4f32;
1707    } else if (!MemcpyStrSrc && Size >= 8 &&
1708               !Subtarget->is64Bit() &&
1709               Subtarget->hasSSE2()) {
1710      // Do not use f64 to lower memcpy if source is string constant. It's
1711      // better to use i32 to avoid the loads.
1712      return MVT::f64;
1713    }
1714  }
1715  if (Subtarget->is64Bit() && Size >= 8)
1716    return MVT::i64;
1717  return MVT::i32;
1718}
1719
1720bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1721  if (VT == MVT::f32)
1722    return X86ScalarSSEf32;
1723  else if (VT == MVT::f64)
1724    return X86ScalarSSEf64;
1725  return true;
1726}
1727
1728bool
1729X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
1730                                                 unsigned,
1731                                                 bool *Fast) const {
1732  if (Fast)
1733    *Fast = Subtarget->isUnalignedMemAccessFast();
1734  return true;
1735}
1736
1737/// getJumpTableEncoding - Return the entry encoding for a jump table in the
1738/// current function.  The returned value is a member of the
1739/// MachineJumpTableInfo::JTEntryKind enum.
1740unsigned X86TargetLowering::getJumpTableEncoding() const {
1741  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1742  // symbol.
1743  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1744      Subtarget->isPICStyleGOT())
1745    return MachineJumpTableInfo::EK_Custom32;
1746
1747  // Otherwise, use the normal jump table encoding heuristics.
1748  return TargetLowering::getJumpTableEncoding();
1749}
1750
1751const MCExpr *
1752X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1753                                             const MachineBasicBlock *MBB,
1754                                             unsigned uid,MCContext &Ctx) const{
1755  assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1756         Subtarget->isPICStyleGOT());
1757  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1758  // entries.
1759  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1760                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1761}
1762
1763/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
1764/// jumptable.
1765SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1766                                                    SelectionDAG &DAG) const {
1767  if (!Subtarget->is64Bit())
1768    // This doesn't have SDLoc associated with it, but is not really the
1769    // same as a Register.
1770    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1771  return Table;
1772}
1773
1774/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1775/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1776/// MCExpr.
1777const MCExpr *X86TargetLowering::
1778getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1779                             MCContext &Ctx) const {
1780  // X86-64 uses RIP relative addressing based on the jump table label.
1781  if (Subtarget->isPICStyleRIPRel())
1782    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1783
1784  // Otherwise, the reference is relative to the PIC base.
1785  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1786}
1787
1788// FIXME: Why this routine is here? Move to RegInfo!
1789std::pair<const TargetRegisterClass*, uint8_t>
1790X86TargetLowering::findRepresentativeClass(MVT VT) const{
1791  const TargetRegisterClass *RRC = nullptr;
1792  uint8_t Cost = 1;
1793  switch (VT.SimpleTy) {
1794  default:
1795    return TargetLowering::findRepresentativeClass(VT);
1796  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1797    RRC = Subtarget->is64Bit() ?
1798      (const TargetRegisterClass*)&X86::GR64RegClass :
1799      (const TargetRegisterClass*)&X86::GR32RegClass;
1800    break;
1801  case MVT::x86mmx:
1802    RRC = &X86::VR64RegClass;
1803    break;
1804  case MVT::f32: case MVT::f64:
1805  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1806  case MVT::v4f32: case MVT::v2f64:
1807  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1808  case MVT::v4f64:
1809    RRC = &X86::VR128RegClass;
1810    break;
1811  }
1812  return std::make_pair(RRC, Cost);
1813}
1814
1815bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1816                                               unsigned &Offset) const {
1817  if (!Subtarget->isTargetLinux())
1818    return false;
1819
1820  if (Subtarget->is64Bit()) {
1821    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1822    Offset = 0x28;
1823    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1824      AddressSpace = 256;
1825    else
1826      AddressSpace = 257;
1827  } else {
1828    // %gs:0x14 on i386
1829    Offset = 0x14;
1830    AddressSpace = 256;
1831  }
1832  return true;
1833}
1834
1835bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1836                                            unsigned DestAS) const {
1837  assert(SrcAS != DestAS && "Expected different address spaces!");
1838
1839  return SrcAS < 256 && DestAS < 256;
1840}
1841
1842//===----------------------------------------------------------------------===//
1843//               Return Value Calling Convention Implementation
1844//===----------------------------------------------------------------------===//
1845
1846#include "X86GenCallingConv.inc"
1847
1848bool
1849X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1850                                  MachineFunction &MF, bool isVarArg,
1851                        const SmallVectorImpl<ISD::OutputArg> &Outs,
1852                        LLVMContext &Context) const {
1853  SmallVector<CCValAssign, 16> RVLocs;
1854  CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
1855                 RVLocs, Context);
1856  return CCInfo.CheckReturn(Outs, RetCC_X86);
1857}
1858
1859const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
1860  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
1861  return ScratchRegs;
1862}
1863
1864SDValue
1865X86TargetLowering::LowerReturn(SDValue Chain,
1866                               CallingConv::ID CallConv, bool isVarArg,
1867                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1868                               const SmallVectorImpl<SDValue> &OutVals,
1869                               SDLoc dl, SelectionDAG &DAG) const {
1870  MachineFunction &MF = DAG.getMachineFunction();
1871  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1872
1873  SmallVector<CCValAssign, 16> RVLocs;
1874  CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
1875                 RVLocs, *DAG.getContext());
1876  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1877
1878  SDValue Flag;
1879  SmallVector<SDValue, 6> RetOps;
1880  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1881  // Operand #1 = Bytes To Pop
1882  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1883                   MVT::i16));
1884
1885  // Copy the result values into the output registers.
1886  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1887    CCValAssign &VA = RVLocs[i];
1888    assert(VA.isRegLoc() && "Can only return in registers!");
1889    SDValue ValToCopy = OutVals[i];
1890    EVT ValVT = ValToCopy.getValueType();
1891
1892    // Promote values to the appropriate types
1893    if (VA.getLocInfo() == CCValAssign::SExt)
1894      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
1895    else if (VA.getLocInfo() == CCValAssign::ZExt)
1896      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
1897    else if (VA.getLocInfo() == CCValAssign::AExt)
1898      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
1899    else if (VA.getLocInfo() == CCValAssign::BCvt)
1900      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
1901
1902    assert(VA.getLocInfo() != CCValAssign::FPExt &&
1903           "Unexpected FP-extend for return value.");
1904
1905    // If this is x86-64, and we disabled SSE, we can't return FP values,
1906    // or SSE or MMX vectors.
1907    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
1908         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
1909          (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
1910      report_fatal_error("SSE register return with SSE disabled");
1911    }
1912    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
1913    // llvm-gcc has never done it right and no one has noticed, so this
1914    // should be OK for now.
1915    if (ValVT == MVT::f64 &&
1916        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
1917      report_fatal_error("SSE2 register return with SSE2 disabled");
1918
1919    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1920    // the RET instruction and handled by the FP Stackifier.
1921    if (VA.getLocReg() == X86::ST0 ||
1922        VA.getLocReg() == X86::ST1) {
1923      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1924      // change the value to the FP stack register class.
1925      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1926        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1927      RetOps.push_back(ValToCopy);
1928      // Don't emit a copytoreg.
1929      continue;
1930    }
1931
1932    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1933    // which is returned in RAX / RDX.
1934    if (Subtarget->is64Bit()) {
1935      if (ValVT == MVT::x86mmx) {
1936        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1937          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
1938          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
1939                                  ValToCopy);
1940          // If we don't have SSE2 available, convert to v4f32 so the generated
1941          // register is legal.
1942          if (!Subtarget->hasSSE2())
1943            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
1944        }
1945      }
1946    }
1947
1948    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1949    Flag = Chain.getValue(1);
1950    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1951  }
1952
1953  // The x86-64 ABIs require that for returning structs by value we copy
1954  // the sret argument into %rax/%eax (depending on ABI) for the return.
1955  // Win32 requires us to put the sret argument to %eax as well.
1956  // We saved the argument into a virtual register in the entry block,
1957  // so now we copy the value out and into %rax/%eax.
1958  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
1959      (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
1960    MachineFunction &MF = DAG.getMachineFunction();
1961    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1962    unsigned Reg = FuncInfo->getSRetReturnReg();
1963    assert(Reg &&
1964           "SRetReturnReg should have been set in LowerFormalArguments().");
1965    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1966
1967    unsigned RetValReg
1968        = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
1969          X86::RAX : X86::EAX;
1970    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
1971    Flag = Chain.getValue(1);
1972
1973    // RAX/EAX now acts like a return value.
1974    RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
1975  }
1976
1977  RetOps[0] = Chain;  // Update chain.
1978
1979  // Add the flag if we have it.
1980  if (Flag.getNode())
1981    RetOps.push_back(Flag);
1982
1983  return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
1984}
1985
1986bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
1987  if (N->getNumValues() != 1)
1988    return false;
1989  if (!N->hasNUsesOfValue(1, 0))
1990    return false;
1991
1992  SDValue TCChain = Chain;
1993  SDNode *Copy = *N->use_begin();
1994  if (Copy->getOpcode() == ISD::CopyToReg) {
1995    // If the copy has a glue operand, we conservatively assume it isn't safe to
1996    // perform a tail call.
1997    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
1998      return false;
1999    TCChain = Copy->getOperand(0);
2000  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2001    return false;
2002
2003  bool HasRet = false;
2004  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2005       UI != UE; ++UI) {
2006    if (UI->getOpcode() != X86ISD::RET_FLAG)
2007      return false;
2008    HasRet = true;
2009  }
2010
2011  if (!HasRet)
2012    return false;
2013
2014  Chain = TCChain;
2015  return true;
2016}
2017
2018MVT
2019X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
2020                                            ISD::NodeType ExtendKind) const {
2021  MVT ReturnMVT;
2022  // TODO: Is this also valid on 32-bit?
2023  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2024    ReturnMVT = MVT::i8;
2025  else
2026    ReturnMVT = MVT::i32;
2027
2028  MVT MinVT = getRegisterType(ReturnMVT);
2029  return VT.bitsLT(MinVT) ? MinVT : VT;
2030}
2031
2032/// LowerCallResult - Lower the result values of a call into the
2033/// appropriate copies out of appropriate physical registers.
2034///
2035SDValue
2036X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2037                                   CallingConv::ID CallConv, bool isVarArg,
2038                                   const SmallVectorImpl<ISD::InputArg> &Ins,
2039                                   SDLoc dl, SelectionDAG &DAG,
2040                                   SmallVectorImpl<SDValue> &InVals) const {
2041
2042  // Assign locations to each value returned by this call.
2043  SmallVector<CCValAssign, 16> RVLocs;
2044  bool Is64Bit = Subtarget->is64Bit();
2045  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
2046                 DAG.getTarget(), RVLocs, *DAG.getContext());
2047  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2048
2049  // Copy all of the result registers out of their specified physreg.
2050  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2051    CCValAssign &VA = RVLocs[i];
2052    EVT CopyVT = VA.getValVT();
2053
2054    // If this is x86-64, and we disabled SSE, we can't return FP values
2055    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2056        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2057      report_fatal_error("SSE register return with SSE disabled");
2058    }
2059
2060    SDValue Val;
2061
2062    // If this is a call to a function that returns an fp value on the floating
2063    // point stack, we must guarantee the value is popped from the stack, so
2064    // a CopyFromReg is not good enough - the copy instruction may be eliminated
2065    // if the return value is not used. We use the FpPOP_RETVAL instruction
2066    // instead.
2067    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
2068      // If we prefer to use the value in xmm registers, copy it out as f80 and
2069      // use a truncate to move it from fp stack reg to xmm reg.
2070      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
2071      SDValue Ops[] = { Chain, InFlag };
2072      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
2073                                         MVT::Other, MVT::Glue, Ops), 1);
2074      Val = Chain.getValue(0);
2075
2076      // Round the f80 to the right size, which also moves it to the appropriate
2077      // xmm register.
2078      if (CopyVT != VA.getValVT())
2079        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2080                          // This truncation won't change the value.
2081                          DAG.getIntPtrConstant(1));
2082    } else {
2083      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2084                                 CopyVT, InFlag).getValue(1);
2085      Val = Chain.getValue(0);
2086    }
2087    InFlag = Chain.getValue(2);
2088    InVals.push_back(Val);
2089  }
2090
2091  return Chain;
2092}
2093
2094//===----------------------------------------------------------------------===//
2095//                C & StdCall & Fast Calling Convention implementation
2096//===----------------------------------------------------------------------===//
2097//  StdCall calling convention seems to be standard for many Windows' API
2098//  routines and around. It differs from C calling convention just a little:
2099//  callee should clean up the stack, not caller. Symbols should be also
2100//  decorated in some fancy way :) It doesn't support any vector arguments.
2101//  For info on fast calling convention see Fast Calling Convention (tail call)
2102//  implementation LowerX86_32FastCCCallTo.
2103
2104/// CallIsStructReturn - Determines whether a call uses struct return
2105/// semantics.
2106enum StructReturnType {
2107  NotStructReturn,
2108  RegStructReturn,
2109  StackStructReturn
2110};
2111static StructReturnType
2112callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2113  if (Outs.empty())
2114    return NotStructReturn;
2115
2116  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2117  if (!Flags.isSRet())
2118    return NotStructReturn;
2119  if (Flags.isInReg())
2120    return RegStructReturn;
2121  return StackStructReturn;
2122}
2123
2124/// ArgsAreStructReturn - Determines whether a function uses struct
2125/// return semantics.
2126static StructReturnType
2127argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2128  if (Ins.empty())
2129    return NotStructReturn;
2130
2131  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2132  if (!Flags.isSRet())
2133    return NotStructReturn;
2134  if (Flags.isInReg())
2135    return RegStructReturn;
2136  return StackStructReturn;
2137}
2138
2139/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
2140/// by "Src" to address "Dst" with size and alignment information specified by
2141/// the specific parameter attribute. The copy will be passed as a byval
2142/// function parameter.
2143static SDValue
2144CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2145                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2146                          SDLoc dl) {
2147  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2148
2149  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2150                       /*isVolatile*/false, /*AlwaysInline=*/true,
2151                       MachinePointerInfo(), MachinePointerInfo());
2152}
2153
2154/// IsTailCallConvention - Return true if the calling convention is one that
2155/// supports tail call optimization.
2156static bool IsTailCallConvention(CallingConv::ID CC) {
2157  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2158          CC == CallingConv::HiPE);
2159}
2160
2161/// \brief Return true if the calling convention is a C calling convention.
2162static bool IsCCallConvention(CallingConv::ID CC) {
2163  return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2164          CC == CallingConv::X86_64_SysV);
2165}
2166
2167bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2168  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2169    return false;
2170
2171  CallSite CS(CI);
2172  CallingConv::ID CalleeCC = CS.getCallingConv();
2173  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2174    return false;
2175
2176  return true;
2177}
2178
2179/// FuncIsMadeTailCallSafe - Return true if the function is being made into
2180/// a tailcall target by changing its ABI.
2181static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2182                                   bool GuaranteedTailCallOpt) {
2183  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2184}
2185
2186SDValue
2187X86TargetLowering::LowerMemArgument(SDValue Chain,
2188                                    CallingConv::ID CallConv,
2189                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2190                                    SDLoc dl, SelectionDAG &DAG,
2191                                    const CCValAssign &VA,
2192                                    MachineFrameInfo *MFI,
2193                                    unsigned i) const {
2194  // Create the nodes corresponding to a load from this parameter slot.
2195  ISD::ArgFlagsTy Flags = Ins[i].Flags;
2196  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2197      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2198  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2199  EVT ValVT;
2200
2201  // If value is passed by pointer we have address passed instead of the value
2202  // itself.
2203  if (VA.getLocInfo() == CCValAssign::Indirect)
2204    ValVT = VA.getLocVT();
2205  else
2206    ValVT = VA.getValVT();
2207
2208  // FIXME: For now, all byval parameter objects are marked mutable. This can be
2209  // changed with more analysis.
2210  // In case of tail call optimization mark all arguments mutable. Since they
2211  // could be overwritten by lowering of arguments in case of a tail call.
2212  if (Flags.isByVal()) {
2213    unsigned Bytes = Flags.getByValSize();
2214    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2215    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2216    return DAG.getFrameIndex(FI, getPointerTy());
2217  } else {
2218    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2219                                    VA.getLocMemOffset(), isImmutable);
2220    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2221    return DAG.getLoad(ValVT, dl, Chain, FIN,
2222                       MachinePointerInfo::getFixedStack(FI),
2223                       false, false, false, 0);
2224  }
2225}
2226
2227SDValue
2228X86TargetLowering::LowerFormalArguments(SDValue Chain,
2229                                        CallingConv::ID CallConv,
2230                                        bool isVarArg,
2231                                      const SmallVectorImpl<ISD::InputArg> &Ins,
2232                                        SDLoc dl,
2233                                        SelectionDAG &DAG,
2234                                        SmallVectorImpl<SDValue> &InVals)
2235                                          const {
2236  MachineFunction &MF = DAG.getMachineFunction();
2237  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2238
2239  const Function* Fn = MF.getFunction();
2240  if (Fn->hasExternalLinkage() &&
2241      Subtarget->isTargetCygMing() &&
2242      Fn->getName() == "main")
2243    FuncInfo->setForceFramePointer(true);
2244
2245  MachineFrameInfo *MFI = MF.getFrameInfo();
2246  bool Is64Bit = Subtarget->is64Bit();
2247  bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2248
2249  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2250         "Var args not supported with calling convention fastcc, ghc or hipe");
2251
2252  // Assign locations to all of the incoming arguments.
2253  SmallVector<CCValAssign, 16> ArgLocs;
2254  CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
2255                 ArgLocs, *DAG.getContext());
2256
2257  // Allocate shadow area for Win64
2258  if (IsWin64)
2259    CCInfo.AllocateStack(32, 8);
2260
2261  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2262
2263  unsigned LastVal = ~0U;
2264  SDValue ArgValue;
2265  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2266    CCValAssign &VA = ArgLocs[i];
2267    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2268    // places.
2269    assert(VA.getValNo() != LastVal &&
2270           "Don't support value assigned to multiple locs yet");
2271    (void)LastVal;
2272    LastVal = VA.getValNo();
2273
2274    if (VA.isRegLoc()) {
2275      EVT RegVT = VA.getLocVT();
2276      const TargetRegisterClass *RC;
2277      if (RegVT == MVT::i32)
2278        RC = &X86::GR32RegClass;
2279      else if (Is64Bit && RegVT == MVT::i64)
2280        RC = &X86::GR64RegClass;
2281      else if (RegVT == MVT::f32)
2282        RC = &X86::FR32RegClass;
2283      else if (RegVT == MVT::f64)
2284        RC = &X86::FR64RegClass;
2285      else if (RegVT.is512BitVector())
2286        RC = &X86::VR512RegClass;
2287      else if (RegVT.is256BitVector())
2288        RC = &X86::VR256RegClass;
2289      else if (RegVT.is128BitVector())
2290        RC = &X86::VR128RegClass;
2291      else if (RegVT == MVT::x86mmx)
2292        RC = &X86::VR64RegClass;
2293      else if (RegVT == MVT::i1)
2294        RC = &X86::VK1RegClass;
2295      else if (RegVT == MVT::v8i1)
2296        RC = &X86::VK8RegClass;
2297      else if (RegVT == MVT::v16i1)
2298        RC = &X86::VK16RegClass;
2299      else
2300        llvm_unreachable("Unknown argument type!");
2301
2302      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2303      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2304
2305      // If this is an 8 or 16-bit value, it is really passed promoted to 32
2306      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2307      // right size.
2308      if (VA.getLocInfo() == CCValAssign::SExt)
2309        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2310                               DAG.getValueType(VA.getValVT()));
2311      else if (VA.getLocInfo() == CCValAssign::ZExt)
2312        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2313                               DAG.getValueType(VA.getValVT()));
2314      else if (VA.getLocInfo() == CCValAssign::BCvt)
2315        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2316
2317      if (VA.isExtInLoc()) {
2318        // Handle MMX values passed in XMM regs.
2319        if (RegVT.isVector())
2320          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2321        else
2322          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2323      }
2324    } else {
2325      assert(VA.isMemLoc());
2326      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2327    }
2328
2329    // If value is passed via pointer - do a load.
2330    if (VA.getLocInfo() == CCValAssign::Indirect)
2331      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2332                             MachinePointerInfo(), false, false, false, 0);
2333
2334    InVals.push_back(ArgValue);
2335  }
2336
2337  if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2338    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2339      // The x86-64 ABIs require that for returning structs by value we copy
2340      // the sret argument into %rax/%eax (depending on ABI) for the return.
2341      // Win32 requires us to put the sret argument to %eax as well.
2342      // Save the argument into a virtual register so that we can access it
2343      // from the return points.
2344      if (Ins[i].Flags.isSRet()) {
2345        unsigned Reg = FuncInfo->getSRetReturnReg();
2346        if (!Reg) {
2347          MVT PtrTy = getPointerTy();
2348          Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2349          FuncInfo->setSRetReturnReg(Reg);
2350        }
2351        SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2352        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2353        break;
2354      }
2355    }
2356  }
2357
2358  unsigned StackSize = CCInfo.getNextStackOffset();
2359  // Align stack specially for tail calls.
2360  if (FuncIsMadeTailCallSafe(CallConv,
2361                             MF.getTarget().Options.GuaranteedTailCallOpt))
2362    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2363
2364  // If the function takes variable number of arguments, make a frame index for
2365  // the start of the first vararg value... for expansion of llvm.va_start.
2366  if (isVarArg) {
2367    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2368                    CallConv != CallingConv::X86_ThisCall)) {
2369      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
2370    }
2371    if (Is64Bit) {
2372      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
2373
2374      // FIXME: We should really autogenerate these arrays
2375      static const MCPhysReg GPR64ArgRegsWin64[] = {
2376        X86::RCX, X86::RDX, X86::R8,  X86::R9
2377      };
2378      static const MCPhysReg GPR64ArgRegs64Bit[] = {
2379        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2380      };
2381      static const MCPhysReg XMMArgRegs64Bit[] = {
2382        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2383        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2384      };
2385      const MCPhysReg *GPR64ArgRegs;
2386      unsigned NumXMMRegs = 0;
2387
2388      if (IsWin64) {
2389        // The XMM registers which might contain var arg parameters are shadowed
2390        // in their paired GPR.  So we only need to save the GPR to their home
2391        // slots.
2392        TotalNumIntRegs = 4;
2393        GPR64ArgRegs = GPR64ArgRegsWin64;
2394      } else {
2395        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
2396        GPR64ArgRegs = GPR64ArgRegs64Bit;
2397
2398        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
2399                                                TotalNumXMMRegs);
2400      }
2401      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
2402                                                       TotalNumIntRegs);
2403
2404      bool NoImplicitFloatOps = Fn->getAttributes().
2405        hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2406      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2407             "SSE register cannot be used when SSE is disabled!");
2408      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
2409               NoImplicitFloatOps) &&
2410             "SSE register cannot be used when SSE is disabled!");
2411      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2412          !Subtarget->hasSSE1())
2413        // Kernel mode asks for SSE to be disabled, so don't push them
2414        // on the stack.
2415        TotalNumXMMRegs = 0;
2416
2417      if (IsWin64) {
2418        const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
2419        // Get to the caller-allocated home save location.  Add 8 to account
2420        // for the return address.
2421        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2422        FuncInfo->setRegSaveFrameIndex(
2423          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2424        // Fixup to set vararg frame on shadow area (4 x i64).
2425        if (NumIntRegs < 4)
2426          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2427      } else {
2428        // For X86-64, if there are vararg parameters that are passed via
2429        // registers, then we must store them to their spots on the stack so
2430        // they may be loaded by deferencing the result of va_next.
2431        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2432        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
2433        FuncInfo->setRegSaveFrameIndex(
2434          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
2435                               false));
2436      }
2437
2438      // Store the integer parameter registers.
2439      SmallVector<SDValue, 8> MemOps;
2440      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2441                                        getPointerTy());
2442      unsigned Offset = FuncInfo->getVarArgsGPOffset();
2443      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
2444        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2445                                  DAG.getIntPtrConstant(Offset));
2446        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
2447                                     &X86::GR64RegClass);
2448        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
2449        SDValue Store =
2450          DAG.getStore(Val.getValue(1), dl, Val, FIN,
2451                       MachinePointerInfo::getFixedStack(
2452                         FuncInfo->getRegSaveFrameIndex(), Offset),
2453                       false, false, 0);
2454        MemOps.push_back(Store);
2455        Offset += 8;
2456      }
2457
2458      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
2459        // Now store the XMM (fp + vector) parameter registers.
2460        SmallVector<SDValue, 11> SaveXMMOps;
2461        SaveXMMOps.push_back(Chain);
2462
2463        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2464        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
2465        SaveXMMOps.push_back(ALVal);
2466
2467        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2468                               FuncInfo->getRegSaveFrameIndex()));
2469        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2470                               FuncInfo->getVarArgsFPOffset()));
2471
2472        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
2473          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
2474                                       &X86::VR128RegClass);
2475          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
2476          SaveXMMOps.push_back(Val);
2477        }
2478        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2479                                     MVT::Other, SaveXMMOps));
2480      }
2481
2482      if (!MemOps.empty())
2483        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2484    }
2485  }
2486
2487  // Some CCs need callee pop.
2488  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2489                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
2490    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2491  } else {
2492    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2493    // If this is an sret function, the return should pop the hidden pointer.
2494    if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2495        !Subtarget->getTargetTriple().isOSMSVCRT() &&
2496        argsAreStructReturn(Ins) == StackStructReturn)
2497      FuncInfo->setBytesToPopOnReturn(4);
2498  }
2499
2500  if (!Is64Bit) {
2501    // RegSaveFrameIndex is X86-64 only.
2502    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2503    if (CallConv == CallingConv::X86_FastCall ||
2504        CallConv == CallingConv::X86_ThisCall)
2505      // fastcc functions can't have varargs.
2506      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2507  }
2508
2509  FuncInfo->setArgumentStackSize(StackSize);
2510
2511  return Chain;
2512}
2513
2514SDValue
2515X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2516                                    SDValue StackPtr, SDValue Arg,
2517                                    SDLoc dl, SelectionDAG &DAG,
2518                                    const CCValAssign &VA,
2519                                    ISD::ArgFlagsTy Flags) const {
2520  unsigned LocMemOffset = VA.getLocMemOffset();
2521  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2522  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2523  if (Flags.isByVal())
2524    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2525
2526  return DAG.getStore(Chain, dl, Arg, PtrOff,
2527                      MachinePointerInfo::getStack(LocMemOffset),
2528                      false, false, 0);
2529}
2530
2531/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
2532/// optimization is performed and it is required.
2533SDValue
2534X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2535                                           SDValue &OutRetAddr, SDValue Chain,
2536                                           bool IsTailCall, bool Is64Bit,
2537                                           int FPDiff, SDLoc dl) const {
2538  // Adjust the Return address stack slot.
2539  EVT VT = getPointerTy();
2540  OutRetAddr = getReturnAddressFrameIndex(DAG);
2541
2542  // Load the "old" Return address.
2543  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2544                           false, false, false, 0);
2545  return SDValue(OutRetAddr.getNode(), 1);
2546}
2547
2548/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
2549/// optimization is performed and it is required (FPDiff!=0).
2550static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2551                                        SDValue Chain, SDValue RetAddrFrIdx,
2552                                        EVT PtrVT, unsigned SlotSize,
2553                                        int FPDiff, SDLoc dl) {
2554  // Store the return address to the appropriate stack slot.
2555  if (!FPDiff) return Chain;
2556  // Calculate the new stack slot for the return address.
2557  int NewReturnAddrFI =
2558    MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2559                                         false);
2560  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2561  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2562                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2563                       false, false, 0);
2564  return Chain;
2565}
2566
2567SDValue
2568X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2569                             SmallVectorImpl<SDValue> &InVals) const {
2570  SelectionDAG &DAG                     = CLI.DAG;
2571  SDLoc &dl                             = CLI.DL;
2572  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2573  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2574  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2575  SDValue Chain                         = CLI.Chain;
2576  SDValue Callee                        = CLI.Callee;
2577  CallingConv::ID CallConv              = CLI.CallConv;
2578  bool &isTailCall                      = CLI.IsTailCall;
2579  bool isVarArg                         = CLI.IsVarArg;
2580
2581  MachineFunction &MF = DAG.getMachineFunction();
2582  bool Is64Bit        = Subtarget->is64Bit();
2583  bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2584  StructReturnType SR = callIsStructReturn(Outs);
2585  bool IsSibcall      = false;
2586
2587  if (MF.getTarget().Options.DisableTailCalls)
2588    isTailCall = false;
2589
2590  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2591  if (IsMustTail) {
2592    // Force this to be a tail call.  The verifier rules are enough to ensure
2593    // that we can lower this successfully without moving the return address
2594    // around.
2595    isTailCall = true;
2596  } else if (isTailCall) {
2597    // Check if it's really possible to do a tail call.
2598    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2599                    isVarArg, SR != NotStructReturn,
2600                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2601                    Outs, OutVals, Ins, DAG);
2602
2603    // Sibcalls are automatically detected tailcalls which do not require
2604    // ABI changes.
2605    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2606      IsSibcall = true;
2607
2608    if (isTailCall)
2609      ++NumTailCalls;
2610  }
2611
2612  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2613         "Var args not supported with calling convention fastcc, ghc or hipe");
2614
2615  // Analyze operands of the call, assigning locations to each operand.
2616  SmallVector<CCValAssign, 16> ArgLocs;
2617  CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
2618                 ArgLocs, *DAG.getContext());
2619
2620  // Allocate shadow area for Win64
2621  if (IsWin64)
2622    CCInfo.AllocateStack(32, 8);
2623
2624  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2625
2626  // Get a count of how many bytes are to be pushed on the stack.
2627  unsigned NumBytes = CCInfo.getNextStackOffset();
2628  if (IsSibcall)
2629    // This is a sibcall. The memory operands are available in caller's
2630    // own caller's stack.
2631    NumBytes = 0;
2632  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2633           IsTailCallConvention(CallConv))
2634    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2635
2636  int FPDiff = 0;
2637  if (isTailCall && !IsSibcall && !IsMustTail) {
2638    // Lower arguments at fp - stackoffset + fpdiff.
2639    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2640    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2641
2642    FPDiff = NumBytesCallerPushed - NumBytes;
2643
2644    // Set the delta of movement of the returnaddr stackslot.
2645    // But only set if delta is greater than previous delta.
2646    if (FPDiff < X86Info->getTCReturnAddrDelta())
2647      X86Info->setTCReturnAddrDelta(FPDiff);
2648  }
2649
2650  unsigned NumBytesToPush = NumBytes;
2651  unsigned NumBytesToPop = NumBytes;
2652
2653  // If we have an inalloca argument, all stack space has already been allocated
2654  // for us and be right at the top of the stack.  We don't support multiple
2655  // arguments passed in memory when using inalloca.
2656  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2657    NumBytesToPush = 0;
2658    assert(ArgLocs.back().getLocMemOffset() == 0 &&
2659           "an inalloca argument must be the only memory argument");
2660  }
2661
2662  if (!IsSibcall)
2663    Chain = DAG.getCALLSEQ_START(
2664        Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2665
2666  SDValue RetAddrFrIdx;
2667  // Load return address for tail calls.
2668  if (isTailCall && FPDiff)
2669    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2670                                    Is64Bit, FPDiff, dl);
2671
2672  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2673  SmallVector<SDValue, 8> MemOpChains;
2674  SDValue StackPtr;
2675
2676  // Walk the register/memloc assignments, inserting copies/loads.  In the case
2677  // of tail call optimization arguments are handle later.
2678  const X86RegisterInfo *RegInfo =
2679    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
2680  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2681    // Skip inalloca arguments, they have already been written.
2682    ISD::ArgFlagsTy Flags = Outs[i].Flags;
2683    if (Flags.isInAlloca())
2684      continue;
2685
2686    CCValAssign &VA = ArgLocs[i];
2687    EVT RegVT = VA.getLocVT();
2688    SDValue Arg = OutVals[i];
2689    bool isByVal = Flags.isByVal();
2690
2691    // Promote the value if needed.
2692    switch (VA.getLocInfo()) {
2693    default: llvm_unreachable("Unknown loc info!");
2694    case CCValAssign::Full: break;
2695    case CCValAssign::SExt:
2696      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2697      break;
2698    case CCValAssign::ZExt:
2699      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2700      break;
2701    case CCValAssign::AExt:
2702      if (RegVT.is128BitVector()) {
2703        // Special case: passing MMX values in XMM registers.
2704        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2705        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2706        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2707      } else
2708        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2709      break;
2710    case CCValAssign::BCvt:
2711      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2712      break;
2713    case CCValAssign::Indirect: {
2714      // Store the argument.
2715      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2716      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2717      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2718                           MachinePointerInfo::getFixedStack(FI),
2719                           false, false, 0);
2720      Arg = SpillSlot;
2721      break;
2722    }
2723    }
2724
2725    if (VA.isRegLoc()) {
2726      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2727      if (isVarArg && IsWin64) {
2728        // Win64 ABI requires argument XMM reg to be copied to the corresponding
2729        // shadow reg if callee is a varargs function.
2730        unsigned ShadowReg = 0;
2731        switch (VA.getLocReg()) {
2732        case X86::XMM0: ShadowReg = X86::RCX; break;
2733        case X86::XMM1: ShadowReg = X86::RDX; break;
2734        case X86::XMM2: ShadowReg = X86::R8; break;
2735        case X86::XMM3: ShadowReg = X86::R9; break;
2736        }
2737        if (ShadowReg)
2738          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2739      }
2740    } else if (!IsSibcall && (!isTailCall || isByVal)) {
2741      assert(VA.isMemLoc());
2742      if (!StackPtr.getNode())
2743        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2744                                      getPointerTy());
2745      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2746                                             dl, DAG, VA, Flags));
2747    }
2748  }
2749
2750  if (!MemOpChains.empty())
2751    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2752
2753  if (Subtarget->isPICStyleGOT()) {
2754    // ELF / PIC requires GOT in the EBX register before function calls via PLT
2755    // GOT pointer.
2756    if (!isTailCall) {
2757      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2758               DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2759    } else {
2760      // If we are tail calling and generating PIC/GOT style code load the
2761      // address of the callee into ECX. The value in ecx is used as target of
2762      // the tail jump. This is done to circumvent the ebx/callee-saved problem
2763      // for tail calls on PIC/GOT architectures. Normally we would just put the
2764      // address of GOT into ebx and then call target@PLT. But for tail calls
2765      // ebx would be restored (since ebx is callee saved) before jumping to the
2766      // target@PLT.
2767
2768      // Note: The actual moving to ECX is done further down.
2769      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2770      if (G && !G->getGlobal()->hasHiddenVisibility() &&
2771          !G->getGlobal()->hasProtectedVisibility())
2772        Callee = LowerGlobalAddress(Callee, DAG);
2773      else if (isa<ExternalSymbolSDNode>(Callee))
2774        Callee = LowerExternalSymbol(Callee, DAG);
2775    }
2776  }
2777
2778  if (Is64Bit && isVarArg && !IsWin64) {
2779    // From AMD64 ABI document:
2780    // For calls that may call functions that use varargs or stdargs
2781    // (prototype-less calls or calls to functions containing ellipsis (...) in
2782    // the declaration) %al is used as hidden argument to specify the number
2783    // of SSE registers used. The contents of %al do not need to match exactly
2784    // the number of registers, but must be an ubound on the number of SSE
2785    // registers used and is in the range 0 - 8 inclusive.
2786
2787    // Count the number of XMM registers allocated.
2788    static const MCPhysReg XMMArgRegs[] = {
2789      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2790      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2791    };
2792    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2793    assert((Subtarget->hasSSE1() || !NumXMMRegs)
2794           && "SSE registers cannot be used when SSE is disabled");
2795
2796    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
2797                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
2798  }
2799
2800  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
2801  // don't need this because the eligibility check rejects calls that require
2802  // shuffling arguments passed in memory.
2803  if (!IsSibcall && isTailCall) {
2804    // Force all the incoming stack arguments to be loaded from the stack
2805    // before any new outgoing arguments are stored to the stack, because the
2806    // outgoing stack slots may alias the incoming argument stack slots, and
2807    // the alias isn't otherwise explicit. This is slightly more conservative
2808    // than necessary, because it means that each store effectively depends
2809    // on every argument instead of just those arguments it would clobber.
2810    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2811
2812    SmallVector<SDValue, 8> MemOpChains2;
2813    SDValue FIN;
2814    int FI = 0;
2815    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2816      CCValAssign &VA = ArgLocs[i];
2817      if (VA.isRegLoc())
2818        continue;
2819      assert(VA.isMemLoc());
2820      SDValue Arg = OutVals[i];
2821      ISD::ArgFlagsTy Flags = Outs[i].Flags;
2822      // Skip inalloca arguments.  They don't require any work.
2823      if (Flags.isInAlloca())
2824        continue;
2825      // Create frame index.
2826      int32_t Offset = VA.getLocMemOffset()+FPDiff;
2827      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2828      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2829      FIN = DAG.getFrameIndex(FI, getPointerTy());
2830
2831      if (Flags.isByVal()) {
2832        // Copy relative to framepointer.
2833        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2834        if (!StackPtr.getNode())
2835          StackPtr = DAG.getCopyFromReg(Chain, dl,
2836                                        RegInfo->getStackRegister(),
2837                                        getPointerTy());
2838        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2839
2840        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2841                                                         ArgChain,
2842                                                         Flags, DAG, dl));
2843      } else {
2844        // Store relative to framepointer.
2845        MemOpChains2.push_back(
2846          DAG.getStore(ArgChain, dl, Arg, FIN,
2847                       MachinePointerInfo::getFixedStack(FI),
2848                       false, false, 0));
2849      }
2850    }
2851
2852    if (!MemOpChains2.empty())
2853      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
2854
2855    // Store the return address to the appropriate stack slot.
2856    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2857                                     getPointerTy(), RegInfo->getSlotSize(),
2858                                     FPDiff, dl);
2859  }
2860
2861  // Build a sequence of copy-to-reg nodes chained together with token chain
2862  // and flag operands which copy the outgoing args into registers.
2863  SDValue InFlag;
2864  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2865    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2866                             RegsToPass[i].second, InFlag);
2867    InFlag = Chain.getValue(1);
2868  }
2869
2870  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
2871    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2872    // In the 64-bit large code model, we have to make all calls
2873    // through a register, since the call instruction's 32-bit
2874    // pc-relative offset may not be large enough to hold the whole
2875    // address.
2876  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2877    // If the callee is a GlobalAddress node (quite common, every direct call
2878    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2879    // it.
2880
2881    // We should use extra load for direct calls to dllimported functions in
2882    // non-JIT mode.
2883    const GlobalValue *GV = G->getGlobal();
2884    if (!GV->hasDLLImportStorageClass()) {
2885      unsigned char OpFlags = 0;
2886      bool ExtraLoad = false;
2887      unsigned WrapperKind = ISD::DELETED_NODE;
2888
2889      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2890      // external symbols most go through the PLT in PIC mode.  If the symbol
2891      // has hidden or protected visibility, or if it is static or local, then
2892      // we don't need to use the PLT - we can directly call it.
2893      if (Subtarget->isTargetELF() &&
2894          DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
2895          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2896        OpFlags = X86II::MO_PLT;
2897      } else if (Subtarget->isPICStyleStubAny() &&
2898                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
2899                 (!Subtarget->getTargetTriple().isMacOSX() ||
2900                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2901        // PC-relative references to external symbols should go through $stub,
2902        // unless we're building with the leopard linker or later, which
2903        // automatically synthesizes these stubs.
2904        OpFlags = X86II::MO_DARWIN_STUB;
2905      } else if (Subtarget->isPICStyleRIPRel() &&
2906                 isa<Function>(GV) &&
2907                 cast<Function>(GV)->getAttributes().
2908                   hasAttribute(AttributeSet::FunctionIndex,
2909                                Attribute::NonLazyBind)) {
2910        // If the function is marked as non-lazy, generate an indirect call
2911        // which loads from the GOT directly. This avoids runtime overhead
2912        // at the cost of eager binding (and one extra byte of encoding).
2913        OpFlags = X86II::MO_GOTPCREL;
2914        WrapperKind = X86ISD::WrapperRIP;
2915        ExtraLoad = true;
2916      }
2917
2918      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2919                                          G->getOffset(), OpFlags);
2920
2921      // Add a wrapper if needed.
2922      if (WrapperKind != ISD::DELETED_NODE)
2923        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
2924      // Add extra indirection if needed.
2925      if (ExtraLoad)
2926        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
2927                             MachinePointerInfo::getGOT(),
2928                             false, false, false, 0);
2929    }
2930  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2931    unsigned char OpFlags = 0;
2932
2933    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2934    // external symbols should go through the PLT.
2935    if (Subtarget->isTargetELF() &&
2936        DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
2937      OpFlags = X86II::MO_PLT;
2938    } else if (Subtarget->isPICStyleStubAny() &&
2939               (!Subtarget->getTargetTriple().isMacOSX() ||
2940                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2941      // PC-relative references to external symbols should go through $stub,
2942      // unless we're building with the leopard linker or later, which
2943      // automatically synthesizes these stubs.
2944      OpFlags = X86II::MO_DARWIN_STUB;
2945    }
2946
2947    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2948                                         OpFlags);
2949  }
2950
2951  // Returns a chain & a flag for retval copy to use.
2952  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2953  SmallVector<SDValue, 8> Ops;
2954
2955  if (!IsSibcall && isTailCall) {
2956    Chain = DAG.getCALLSEQ_END(Chain,
2957                               DAG.getIntPtrConstant(NumBytesToPop, true),
2958                               DAG.getIntPtrConstant(0, true), InFlag, dl);
2959    InFlag = Chain.getValue(1);
2960  }
2961
2962  Ops.push_back(Chain);
2963  Ops.push_back(Callee);
2964
2965  if (isTailCall)
2966    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2967
2968  // Add argument registers to the end of the list so that they are known live
2969  // into the call.
2970  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2971    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2972                                  RegsToPass[i].second.getValueType()));
2973
2974  // Add a register mask operand representing the call-preserved registers.
2975  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
2976  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
2977  assert(Mask && "Missing call preserved mask for calling convention");
2978  Ops.push_back(DAG.getRegisterMask(Mask));
2979
2980  if (InFlag.getNode())
2981    Ops.push_back(InFlag);
2982
2983  if (isTailCall) {
2984    // We used to do:
2985    //// If this is the first return lowered for this function, add the regs
2986    //// to the liveout set for the function.
2987    // This isn't right, although it's probably harmless on x86; liveouts
2988    // should be computed from returns not tail calls.  Consider a void
2989    // function making a tail call to a function returning int.
2990    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
2991  }
2992
2993  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
2994  InFlag = Chain.getValue(1);
2995
2996  // Create the CALLSEQ_END node.
2997  unsigned NumBytesForCalleeToPop;
2998  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2999                       DAG.getTarget().Options.GuaranteedTailCallOpt))
3000    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3001  else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3002           !Subtarget->getTargetTriple().isOSMSVCRT() &&
3003           SR == StackStructReturn)
3004    // If this is a call to a struct-return function, the callee
3005    // pops the hidden struct pointer, so we have to push it back.
3006    // This is common for Darwin/X86, Linux & Mingw32 targets.
3007    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3008    NumBytesForCalleeToPop = 4;
3009  else
3010    NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3011
3012  // Returns a flag for retval copy to use.
3013  if (!IsSibcall) {
3014    Chain = DAG.getCALLSEQ_END(Chain,
3015                               DAG.getIntPtrConstant(NumBytesToPop, true),
3016                               DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3017                                                     true),
3018                               InFlag, dl);
3019    InFlag = Chain.getValue(1);
3020  }
3021
3022  // Handle result values, copying them out of physregs into vregs that we
3023  // return.
3024  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3025                         Ins, dl, DAG, InVals);
3026}
3027
3028//===----------------------------------------------------------------------===//
3029//                Fast Calling Convention (tail call) implementation
3030//===----------------------------------------------------------------------===//
3031
3032//  Like std call, callee cleans arguments, convention except that ECX is
3033//  reserved for storing the tail called function address. Only 2 registers are
3034//  free for argument passing (inreg). Tail call optimization is performed
3035//  provided:
3036//                * tailcallopt is enabled
3037//                * caller/callee are fastcc
3038//  On X86_64 architecture with GOT-style position independent code only local
3039//  (within module) calls are supported at the moment.
3040//  To keep the stack aligned according to platform abi the function
3041//  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3042//  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3043//  If a tail called function callee has more arguments than the caller the
3044//  caller needs to make sure that there is room to move the RETADDR to. This is
3045//  achieved by reserving an area the size of the argument delta right after the
3046//  original REtADDR, but before the saved framepointer or the spilled registers
3047//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3048//  stack layout:
3049//    arg1
3050//    arg2
3051//    RETADDR
3052//    [ new RETADDR
3053//      move area ]
3054//    (possible EBP)
3055//    ESI
3056//    EDI
3057//    local1 ..
3058
3059/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
3060/// for a 16 byte align requirement.
3061unsigned
3062X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3063                                               SelectionDAG& DAG) const {
3064  MachineFunction &MF = DAG.getMachineFunction();
3065  const TargetMachine &TM = MF.getTarget();
3066  const X86RegisterInfo *RegInfo =
3067    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
3068  const TargetFrameLowering &TFI = *TM.getFrameLowering();
3069  unsigned StackAlignment = TFI.getStackAlignment();
3070  uint64_t AlignMask = StackAlignment - 1;
3071  int64_t Offset = StackSize;
3072  unsigned SlotSize = RegInfo->getSlotSize();
3073  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3074    // Number smaller than 12 so just add the difference.
3075    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3076  } else {
3077    // Mask out lower bits, add stackalignment once plus the 12 bytes.
3078    Offset = ((~AlignMask) & Offset) + StackAlignment +
3079      (StackAlignment-SlotSize);
3080  }
3081  return Offset;
3082}
3083
3084/// MatchingStackOffset - Return true if the given stack call argument is
3085/// already available in the same position (relatively) of the caller's
3086/// incoming argument stack.
3087static
3088bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3089                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3090                         const X86InstrInfo *TII) {
3091  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3092  int FI = INT_MAX;
3093  if (Arg.getOpcode() == ISD::CopyFromReg) {
3094    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3095    if (!TargetRegisterInfo::isVirtualRegister(VR))
3096      return false;
3097    MachineInstr *Def = MRI->getVRegDef(VR);
3098    if (!Def)
3099      return false;
3100    if (!Flags.isByVal()) {
3101      if (!TII->isLoadFromStackSlot(Def, FI))
3102        return false;
3103    } else {
3104      unsigned Opcode = Def->getOpcode();
3105      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
3106          Def->getOperand(1).isFI()) {
3107        FI = Def->getOperand(1).getIndex();
3108        Bytes = Flags.getByValSize();
3109      } else
3110        return false;
3111    }
3112  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3113    if (Flags.isByVal())
3114      // ByVal argument is passed in as a pointer but it's now being
3115      // dereferenced. e.g.
3116      // define @foo(%struct.X* %A) {
3117      //   tail call @bar(%struct.X* byval %A)
3118      // }
3119      return false;
3120    SDValue Ptr = Ld->getBasePtr();
3121    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3122    if (!FINode)
3123      return false;
3124    FI = FINode->getIndex();
3125  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3126    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3127    FI = FINode->getIndex();
3128    Bytes = Flags.getByValSize();
3129  } else
3130    return false;
3131
3132  assert(FI != INT_MAX);
3133  if (!MFI->isFixedObjectIndex(FI))
3134    return false;
3135  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3136}
3137
3138/// IsEligibleForTailCallOptimization - Check whether the call is eligible
3139/// for tail call optimization. Targets which want to do tail call
3140/// optimization should implement this function.
3141bool
3142X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3143                                                     CallingConv::ID CalleeCC,
3144                                                     bool isVarArg,
3145                                                     bool isCalleeStructRet,
3146                                                     bool isCallerStructRet,
3147                                                     Type *RetTy,
3148                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
3149                                    const SmallVectorImpl<SDValue> &OutVals,
3150                                    const SmallVectorImpl<ISD::InputArg> &Ins,
3151                                                     SelectionDAG &DAG) const {
3152  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3153    return false;
3154
3155  // If -tailcallopt is specified, make fastcc functions tail-callable.
3156  const MachineFunction &MF = DAG.getMachineFunction();
3157  const Function *CallerF = MF.getFunction();
3158
3159  // If the function return type is x86_fp80 and the callee return type is not,
3160  // then the FP_EXTEND of the call result is not a nop. It's not safe to
3161  // perform a tailcall optimization here.
3162  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3163    return false;
3164
3165  CallingConv::ID CallerCC = CallerF->getCallingConv();
3166  bool CCMatch = CallerCC == CalleeCC;
3167  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3168  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3169
3170  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3171    if (IsTailCallConvention(CalleeCC) && CCMatch)
3172      return true;
3173    return false;
3174  }
3175
3176  // Look for obvious safe cases to perform tail call optimization that do not
3177  // require ABI changes. This is what gcc calls sibcall.
3178
3179  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3180  // emit a special epilogue.
3181  const X86RegisterInfo *RegInfo =
3182    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
3183  if (RegInfo->needsStackRealignment(MF))
3184    return false;
3185
3186  // Also avoid sibcall optimization if either caller or callee uses struct
3187  // return semantics.
3188  if (isCalleeStructRet || isCallerStructRet)
3189    return false;
3190
3191  // An stdcall/thiscall caller is expected to clean up its arguments; the
3192  // callee isn't going to do that.
3193  // FIXME: this is more restrictive than needed. We could produce a tailcall
3194  // when the stack adjustment matches. For example, with a thiscall that takes
3195  // only one argument.
3196  if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
3197                   CallerCC == CallingConv::X86_ThisCall))
3198    return false;
3199
3200  // Do not sibcall optimize vararg calls unless all arguments are passed via
3201  // registers.
3202  if (isVarArg && !Outs.empty()) {
3203
3204    // Optimizing for varargs on Win64 is unlikely to be safe without
3205    // additional testing.
3206    if (IsCalleeWin64 || IsCallerWin64)
3207      return false;
3208
3209    SmallVector<CCValAssign, 16> ArgLocs;
3210    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
3211                   DAG.getTarget(), ArgLocs, *DAG.getContext());
3212
3213    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3214    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3215      if (!ArgLocs[i].isRegLoc())
3216        return false;
3217  }
3218
3219  // If the call result is in ST0 / ST1, it needs to be popped off the x87
3220  // stack.  Therefore, if it's not used by the call it is not safe to optimize
3221  // this into a sibcall.
3222  bool Unused = false;
3223  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3224    if (!Ins[i].Used) {
3225      Unused = true;
3226      break;
3227    }
3228  }
3229  if (Unused) {
3230    SmallVector<CCValAssign, 16> RVLocs;
3231    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
3232                   DAG.getTarget(), RVLocs, *DAG.getContext());
3233    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3234    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3235      CCValAssign &VA = RVLocs[i];
3236      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
3237        return false;
3238    }
3239  }
3240
3241  // If the calling conventions do not match, then we'd better make sure the
3242  // results are returned in the same way as what the caller expects.
3243  if (!CCMatch) {
3244    SmallVector<CCValAssign, 16> RVLocs1;
3245    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
3246                    DAG.getTarget(), RVLocs1, *DAG.getContext());
3247    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3248
3249    SmallVector<CCValAssign, 16> RVLocs2;
3250    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
3251                    DAG.getTarget(), RVLocs2, *DAG.getContext());
3252    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3253
3254    if (RVLocs1.size() != RVLocs2.size())
3255      return false;
3256    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3257      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3258        return false;
3259      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3260        return false;
3261      if (RVLocs1[i].isRegLoc()) {
3262        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3263          return false;
3264      } else {
3265        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3266          return false;
3267      }
3268    }
3269  }
3270
3271  // If the callee takes no arguments then go on to check the results of the
3272  // call.
3273  if (!Outs.empty()) {
3274    // Check if stack adjustment is needed. For now, do not do this if any
3275    // argument is passed on the stack.
3276    SmallVector<CCValAssign, 16> ArgLocs;
3277    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
3278                   DAG.getTarget(), ArgLocs, *DAG.getContext());
3279
3280    // Allocate shadow area for Win64
3281    if (IsCalleeWin64)
3282      CCInfo.AllocateStack(32, 8);
3283
3284    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3285    if (CCInfo.getNextStackOffset()) {
3286      MachineFunction &MF = DAG.getMachineFunction();
3287      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3288        return false;
3289
3290      // Check if the arguments are already laid out in the right way as
3291      // the caller's fixed stack objects.
3292      MachineFrameInfo *MFI = MF.getFrameInfo();
3293      const MachineRegisterInfo *MRI = &MF.getRegInfo();
3294      const X86InstrInfo *TII =
3295          static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
3296      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3297        CCValAssign &VA = ArgLocs[i];
3298        SDValue Arg = OutVals[i];
3299        ISD::ArgFlagsTy Flags = Outs[i].Flags;
3300        if (VA.getLocInfo() == CCValAssign::Indirect)
3301          return false;
3302        if (!VA.isRegLoc()) {
3303          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3304                                   MFI, MRI, TII))
3305            return false;
3306        }
3307      }
3308    }
3309
3310    // If the tailcall address may be in a register, then make sure it's
3311    // possible to register allocate for it. In 32-bit, the call address can
3312    // only target EAX, EDX, or ECX since the tail call must be scheduled after
3313    // callee-saved registers are restored. These happen to be the same
3314    // registers used to pass 'inreg' arguments so watch out for those.
3315    if (!Subtarget->is64Bit() &&
3316        ((!isa<GlobalAddressSDNode>(Callee) &&
3317          !isa<ExternalSymbolSDNode>(Callee)) ||
3318         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
3319      unsigned NumInRegs = 0;
3320      // In PIC we need an extra register to formulate the address computation
3321      // for the callee.
3322      unsigned MaxInRegs =
3323	(DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3324
3325      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3326        CCValAssign &VA = ArgLocs[i];
3327        if (!VA.isRegLoc())
3328          continue;
3329        unsigned Reg = VA.getLocReg();
3330        switch (Reg) {
3331        default: break;
3332        case X86::EAX: case X86::EDX: case X86::ECX:
3333          if (++NumInRegs == MaxInRegs)
3334            return false;
3335          break;
3336        }
3337      }
3338    }
3339  }
3340
3341  return true;
3342}
3343
3344FastISel *
3345X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3346                                  const TargetLibraryInfo *libInfo) const {
3347  return X86::createFastISel(funcInfo, libInfo);
3348}
3349
3350//===----------------------------------------------------------------------===//
3351//                           Other Lowering Hooks
3352//===----------------------------------------------------------------------===//
3353
3354static bool MayFoldLoad(SDValue Op) {
3355  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3356}
3357
3358static bool MayFoldIntoStore(SDValue Op) {
3359  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3360}
3361
3362static bool isTargetShuffle(unsigned Opcode) {
3363  switch(Opcode) {
3364  default: return false;
3365  case X86ISD::PSHUFD:
3366  case X86ISD::PSHUFHW:
3367  case X86ISD::PSHUFLW:
3368  case X86ISD::SHUFP:
3369  case X86ISD::PALIGNR:
3370  case X86ISD::MOVLHPS:
3371  case X86ISD::MOVLHPD:
3372  case X86ISD::MOVHLPS:
3373  case X86ISD::MOVLPS:
3374  case X86ISD::MOVLPD:
3375  case X86ISD::MOVSHDUP:
3376  case X86ISD::MOVSLDUP:
3377  case X86ISD::MOVDDUP:
3378  case X86ISD::MOVSS:
3379  case X86ISD::MOVSD:
3380  case X86ISD::UNPCKL:
3381  case X86ISD::UNPCKH:
3382  case X86ISD::VPERMILP:
3383  case X86ISD::VPERM2X128:
3384  case X86ISD::VPERMI:
3385    return true;
3386  }
3387}
3388
3389static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3390                                    SDValue V1, SelectionDAG &DAG) {
3391  switch(Opc) {
3392  default: llvm_unreachable("Unknown x86 shuffle node");
3393  case X86ISD::MOVSHDUP:
3394  case X86ISD::MOVSLDUP:
3395  case X86ISD::MOVDDUP:
3396    return DAG.getNode(Opc, dl, VT, V1);
3397  }
3398}
3399
3400static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3401                                    SDValue V1, unsigned TargetMask,
3402                                    SelectionDAG &DAG) {
3403  switch(Opc) {
3404  default: llvm_unreachable("Unknown x86 shuffle node");
3405  case X86ISD::PSHUFD:
3406  case X86ISD::PSHUFHW:
3407  case X86ISD::PSHUFLW:
3408  case X86ISD::VPERMILP:
3409  case X86ISD::VPERMI:
3410    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3411  }
3412}
3413
3414static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3415                                    SDValue V1, SDValue V2, unsigned TargetMask,
3416                                    SelectionDAG &DAG) {
3417  switch(Opc) {
3418  default: llvm_unreachable("Unknown x86 shuffle node");
3419  case X86ISD::PALIGNR:
3420  case X86ISD::SHUFP:
3421  case X86ISD::VPERM2X128:
3422    return DAG.getNode(Opc, dl, VT, V1, V2,
3423                       DAG.getConstant(TargetMask, MVT::i8));
3424  }
3425}
3426
3427static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3428                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
3429  switch(Opc) {
3430  default: llvm_unreachable("Unknown x86 shuffle node");
3431  case X86ISD::MOVLHPS:
3432  case X86ISD::MOVLHPD:
3433  case X86ISD::MOVHLPS:
3434  case X86ISD::MOVLPS:
3435  case X86ISD::MOVLPD:
3436  case X86ISD::MOVSS:
3437  case X86ISD::MOVSD:
3438  case X86ISD::UNPCKL:
3439  case X86ISD::UNPCKH:
3440    return DAG.getNode(Opc, dl, VT, V1, V2);
3441  }
3442}
3443
3444SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3445  MachineFunction &MF = DAG.getMachineFunction();
3446  const X86RegisterInfo *RegInfo =
3447    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
3448  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3449  int ReturnAddrIndex = FuncInfo->getRAIndex();
3450
3451  if (ReturnAddrIndex == 0) {
3452    // Set up a frame object for the return address.
3453    unsigned SlotSize = RegInfo->getSlotSize();
3454    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3455                                                           -(int64_t)SlotSize,
3456                                                           false);
3457    FuncInfo->setRAIndex(ReturnAddrIndex);
3458  }
3459
3460  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3461}
3462
3463bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3464                                       bool hasSymbolicDisplacement) {
3465  // Offset should fit into 32 bit immediate field.
3466  if (!isInt<32>(Offset))
3467    return false;
3468
3469  // If we don't have a symbolic displacement - we don't have any extra
3470  // restrictions.
3471  if (!hasSymbolicDisplacement)
3472    return true;
3473
3474  // FIXME: Some tweaks might be needed for medium code model.
3475  if (M != CodeModel::Small && M != CodeModel::Kernel)
3476    return false;
3477
3478  // For small code model we assume that latest object is 16MB before end of 31
3479  // bits boundary. We may also accept pretty large negative constants knowing
3480  // that all objects are in the positive half of address space.
3481  if (M == CodeModel::Small && Offset < 16*1024*1024)
3482    return true;
3483
3484  // For kernel code model we know that all object resist in the negative half
3485  // of 32bits address space. We may not accept negative offsets, since they may
3486  // be just off and we may accept pretty large positive ones.
3487  if (M == CodeModel::Kernel && Offset > 0)
3488    return true;
3489
3490  return false;
3491}
3492
3493/// isCalleePop - Determines whether the callee is required to pop its
3494/// own arguments. Callee pop is necessary to support tail calls.
3495bool X86::isCalleePop(CallingConv::ID CallingConv,
3496                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3497  if (IsVarArg)
3498    return false;
3499
3500  switch (CallingConv) {
3501  default:
3502    return false;
3503  case CallingConv::X86_StdCall:
3504    return !is64Bit;
3505  case CallingConv::X86_FastCall:
3506    return !is64Bit;
3507  case CallingConv::X86_ThisCall:
3508    return !is64Bit;
3509  case CallingConv::Fast:
3510    return TailCallOpt;
3511  case CallingConv::GHC:
3512    return TailCallOpt;
3513  case CallingConv::HiPE:
3514    return TailCallOpt;
3515  }
3516}
3517
3518/// \brief Return true if the condition is an unsigned comparison operation.
3519static bool isX86CCUnsigned(unsigned X86CC) {
3520  switch (X86CC) {
3521  default: llvm_unreachable("Invalid integer condition!");
3522  case X86::COND_E:     return true;
3523  case X86::COND_G:     return false;
3524  case X86::COND_GE:    return false;
3525  case X86::COND_L:     return false;
3526  case X86::COND_LE:    return false;
3527  case X86::COND_NE:    return true;
3528  case X86::COND_B:     return true;
3529  case X86::COND_A:     return true;
3530  case X86::COND_BE:    return true;
3531  case X86::COND_AE:    return true;
3532  }
3533  llvm_unreachable("covered switch fell through?!");
3534}
3535
3536/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
3537/// specific condition code, returning the condition code and the LHS/RHS of the
3538/// comparison to make.
3539static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3540                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3541  if (!isFP) {
3542    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3543      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3544        // X > -1   -> X == 0, jump !sign.
3545        RHS = DAG.getConstant(0, RHS.getValueType());
3546        return X86::COND_NS;
3547      }
3548      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3549        // X < 0   -> X == 0, jump on sign.
3550        return X86::COND_S;
3551      }
3552      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3553        // X < 1   -> X <= 0
3554        RHS = DAG.getConstant(0, RHS.getValueType());
3555        return X86::COND_LE;
3556      }
3557    }
3558
3559    switch (SetCCOpcode) {
3560    default: llvm_unreachable("Invalid integer condition!");
3561    case ISD::SETEQ:  return X86::COND_E;
3562    case ISD::SETGT:  return X86::COND_G;
3563    case ISD::SETGE:  return X86::COND_GE;
3564    case ISD::SETLT:  return X86::COND_L;
3565    case ISD::SETLE:  return X86::COND_LE;
3566    case ISD::SETNE:  return X86::COND_NE;
3567    case ISD::SETULT: return X86::COND_B;
3568    case ISD::SETUGT: return X86::COND_A;
3569    case ISD::SETULE: return X86::COND_BE;
3570    case ISD::SETUGE: return X86::COND_AE;
3571    }
3572  }
3573
3574  // First determine if it is required or is profitable to flip the operands.
3575
3576  // If LHS is a foldable load, but RHS is not, flip the condition.
3577  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3578      !ISD::isNON_EXTLoad(RHS.getNode())) {
3579    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3580    std::swap(LHS, RHS);
3581  }
3582
3583  switch (SetCCOpcode) {
3584  default: break;
3585  case ISD::SETOLT:
3586  case ISD::SETOLE:
3587  case ISD::SETUGT:
3588  case ISD::SETUGE:
3589    std::swap(LHS, RHS);
3590    break;
3591  }
3592
3593  // On a floating point condition, the flags are set as follows:
3594  // ZF  PF  CF   op
3595  //  0 | 0 | 0 | X > Y
3596  //  0 | 0 | 1 | X < Y
3597  //  1 | 0 | 0 | X == Y
3598  //  1 | 1 | 1 | unordered
3599  switch (SetCCOpcode) {
3600  default: llvm_unreachable("Condcode should be pre-legalized away");
3601  case ISD::SETUEQ:
3602  case ISD::SETEQ:   return X86::COND_E;
3603  case ISD::SETOLT:              // flipped
3604  case ISD::SETOGT:
3605  case ISD::SETGT:   return X86::COND_A;
3606  case ISD::SETOLE:              // flipped
3607  case ISD::SETOGE:
3608  case ISD::SETGE:   return X86::COND_AE;
3609  case ISD::SETUGT:              // flipped
3610  case ISD::SETULT:
3611  case ISD::SETLT:   return X86::COND_B;
3612  case ISD::SETUGE:              // flipped
3613  case ISD::SETULE:
3614  case ISD::SETLE:   return X86::COND_BE;
3615  case ISD::SETONE:
3616  case ISD::SETNE:   return X86::COND_NE;
3617  case ISD::SETUO:   return X86::COND_P;
3618  case ISD::SETO:    return X86::COND_NP;
3619  case ISD::SETOEQ:
3620  case ISD::SETUNE:  return X86::COND_INVALID;
3621  }
3622}
3623
3624/// hasFPCMov - is there a floating point cmov for the specific X86 condition
3625/// code. Current x86 isa includes the following FP cmov instructions:
3626/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3627static bool hasFPCMov(unsigned X86CC) {
3628  switch (X86CC) {
3629  default:
3630    return false;
3631  case X86::COND_B:
3632  case X86::COND_BE:
3633  case X86::COND_E:
3634  case X86::COND_P:
3635  case X86::COND_A:
3636  case X86::COND_AE:
3637  case X86::COND_NE:
3638  case X86::COND_NP:
3639    return true;
3640  }
3641}
3642
3643/// isFPImmLegal - Returns true if the target can instruction select the
3644/// specified FP immediate natively. If false, the legalizer will
3645/// materialize the FP immediate as a load from a constant pool.
3646bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3647  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3648    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3649      return true;
3650  }
3651  return false;
3652}
3653
3654/// \brief Returns true if it is beneficial to convert a load of a constant
3655/// to just the constant itself.
3656bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3657                                                          Type *Ty) const {
3658  assert(Ty->isIntegerTy());
3659
3660  unsigned BitSize = Ty->getPrimitiveSizeInBits();
3661  if (BitSize == 0 || BitSize > 64)
3662    return false;
3663  return true;
3664}
3665
3666/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3667/// the specified range (L, H].
3668static bool isUndefOrInRange(int Val, int Low, int Hi) {
3669  return (Val < 0) || (Val >= Low && Val < Hi);
3670}
3671
3672/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3673/// specified value.
3674static bool isUndefOrEqual(int Val, int CmpVal) {
3675  return (Val < 0 || Val == CmpVal);
3676}
3677
3678/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3679/// from position Pos and ending in Pos+Size, falls within the specified
3680/// sequential range (L, L+Pos]. or is undef.
3681static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3682                                       unsigned Pos, unsigned Size, int Low) {
3683  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3684    if (!isUndefOrEqual(Mask[i], Low))
3685      return false;
3686  return true;
3687}
3688
3689/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3690/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
3691/// the second operand.
3692static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
3693  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
3694    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
3695  if (VT == MVT::v2f64 || VT == MVT::v2i64)
3696    return (Mask[0] < 2 && Mask[1] < 2);
3697  return false;
3698}
3699
3700/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3701/// is suitable for input to PSHUFHW.
3702static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3703  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3704    return false;
3705
3706  // Lower quadword copied in order or undef.
3707  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3708    return false;
3709
3710  // Upper quadword shuffled.
3711  for (unsigned i = 4; i != 8; ++i)
3712    if (!isUndefOrInRange(Mask[i], 4, 8))
3713      return false;
3714
3715  if (VT == MVT::v16i16) {
3716    // Lower quadword copied in order or undef.
3717    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3718      return false;
3719
3720    // Upper quadword shuffled.
3721    for (unsigned i = 12; i != 16; ++i)
3722      if (!isUndefOrInRange(Mask[i], 12, 16))
3723        return false;
3724  }
3725
3726  return true;
3727}
3728
3729/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3730/// is suitable for input to PSHUFLW.
3731static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3732  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3733    return false;
3734
3735  // Upper quadword copied in order.
3736  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3737    return false;
3738
3739  // Lower quadword shuffled.
3740  for (unsigned i = 0; i != 4; ++i)
3741    if (!isUndefOrInRange(Mask[i], 0, 4))
3742      return false;
3743
3744  if (VT == MVT::v16i16) {
3745    // Upper quadword copied in order.
3746    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
3747      return false;
3748
3749    // Lower quadword shuffled.
3750    for (unsigned i = 8; i != 12; ++i)
3751      if (!isUndefOrInRange(Mask[i], 8, 12))
3752        return false;
3753  }
3754
3755  return true;
3756}
3757
3758/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
3759/// is suitable for input to PALIGNR.
3760static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
3761                          const X86Subtarget *Subtarget) {
3762  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
3763      (VT.is256BitVector() && !Subtarget->hasInt256()))
3764    return false;
3765
3766  unsigned NumElts = VT.getVectorNumElements();
3767  unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
3768  unsigned NumLaneElts = NumElts/NumLanes;
3769
3770  // Do not handle 64-bit element shuffles with palignr.
3771  if (NumLaneElts == 2)
3772    return false;
3773
3774  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
3775    unsigned i;
3776    for (i = 0; i != NumLaneElts; ++i) {
3777      if (Mask[i+l] >= 0)
3778        break;
3779    }
3780
3781    // Lane is all undef, go to next lane
3782    if (i == NumLaneElts)
3783      continue;
3784
3785    int Start = Mask[i+l];
3786
3787    // Make sure its in this lane in one of the sources
3788    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
3789        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
3790      return false;
3791
3792    // If not lane 0, then we must match lane 0
3793    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
3794      return false;
3795
3796    // Correct second source to be contiguous with first source
3797    if (Start >= (int)NumElts)
3798      Start -= NumElts - NumLaneElts;
3799
3800    // Make sure we're shifting in the right direction.
3801    if (Start <= (int)(i+l))
3802      return false;
3803
3804    Start -= i;
3805
3806    // Check the rest of the elements to see if they are consecutive.
3807    for (++i; i != NumLaneElts; ++i) {
3808      int Idx = Mask[i+l];
3809
3810      // Make sure its in this lane
3811      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
3812          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
3813        return false;
3814
3815      // If not lane 0, then we must match lane 0
3816      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
3817        return false;
3818
3819      if (Idx >= (int)NumElts)
3820        Idx -= NumElts - NumLaneElts;
3821
3822      if (!isUndefOrEqual(Idx, Start+i))
3823        return false;
3824
3825    }
3826  }
3827
3828  return true;
3829}
3830
3831/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3832/// the two vector operands have swapped position.
3833static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
3834                                     unsigned NumElems) {
3835  for (unsigned i = 0; i != NumElems; ++i) {
3836    int idx = Mask[i];
3837    if (idx < 0)
3838      continue;
3839    else if (idx < (int)NumElems)
3840      Mask[i] = idx + NumElems;
3841    else
3842      Mask[i] = idx - NumElems;
3843  }
3844}
3845
3846/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3847/// specifies a shuffle of elements that is suitable for input to 128/256-bit
3848/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
3849/// reverse of what x86 shuffles want.
3850static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
3851
3852  unsigned NumElems = VT.getVectorNumElements();
3853  unsigned NumLanes = VT.getSizeInBits()/128;
3854  unsigned NumLaneElems = NumElems/NumLanes;
3855
3856  if (NumLaneElems != 2 && NumLaneElems != 4)
3857    return false;
3858
3859  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
3860  bool symetricMaskRequired =
3861    (VT.getSizeInBits() >= 256) && (EltSize == 32);
3862
3863  // VSHUFPSY divides the resulting vector into 4 chunks.
3864  // The sources are also splitted into 4 chunks, and each destination
3865  // chunk must come from a different source chunk.
3866  //
3867  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
3868  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
3869  //
3870  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
3871  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
3872  //
3873  // VSHUFPDY divides the resulting vector into 4 chunks.
3874  // The sources are also splitted into 4 chunks, and each destination
3875  // chunk must come from a different source chunk.
3876  //
3877  //  SRC1 =>      X3       X2       X1       X0
3878  //  SRC2 =>      Y3       Y2       Y1       Y0
3879  //
3880  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
3881  //
3882  SmallVector<int, 4> MaskVal(NumLaneElems, -1);
3883  unsigned HalfLaneElems = NumLaneElems/2;
3884  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
3885    for (unsigned i = 0; i != NumLaneElems; ++i) {
3886      int Idx = Mask[i+l];
3887      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
3888      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
3889        return false;
3890      // For VSHUFPSY, the mask of the second half must be the same as the
3891      // first but with the appropriate offsets. This works in the same way as
3892      // VPERMILPS works with masks.
3893      if (!symetricMaskRequired || Idx < 0)
3894        continue;
3895      if (MaskVal[i] < 0) {
3896        MaskVal[i] = Idx - l;
3897        continue;
3898      }
3899      if ((signed)(Idx - l) != MaskVal[i])
3900        return false;
3901    }
3902  }
3903
3904  return true;
3905}
3906
3907/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3908/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3909static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
3910  if (!VT.is128BitVector())
3911    return false;
3912
3913  unsigned NumElems = VT.getVectorNumElements();
3914
3915  if (NumElems != 4)
3916    return false;
3917
3918  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
3919  return isUndefOrEqual(Mask[0], 6) &&
3920         isUndefOrEqual(Mask[1], 7) &&
3921         isUndefOrEqual(Mask[2], 2) &&
3922         isUndefOrEqual(Mask[3], 3);
3923}
3924
3925/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3926/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3927/// <2, 3, 2, 3>
3928static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
3929  if (!VT.is128BitVector())
3930    return false;
3931
3932  unsigned NumElems = VT.getVectorNumElements();
3933
3934  if (NumElems != 4)
3935    return false;
3936
3937  return isUndefOrEqual(Mask[0], 2) &&
3938         isUndefOrEqual(Mask[1], 3) &&
3939         isUndefOrEqual(Mask[2], 2) &&
3940         isUndefOrEqual(Mask[3], 3);
3941}
3942
3943/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3944/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
3945static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
3946  if (!VT.is128BitVector())
3947    return false;
3948
3949  unsigned NumElems = VT.getVectorNumElements();
3950
3951  if (NumElems != 2 && NumElems != 4)
3952    return false;
3953
3954  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3955    if (!isUndefOrEqual(Mask[i], i + NumElems))
3956      return false;
3957
3958  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
3959    if (!isUndefOrEqual(Mask[i], i))
3960      return false;
3961
3962  return true;
3963}
3964
3965/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3966/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
3967static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
3968  if (!VT.is128BitVector())
3969    return false;
3970
3971  unsigned NumElems = VT.getVectorNumElements();
3972
3973  if (NumElems != 2 && NumElems != 4)
3974    return false;
3975
3976  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3977    if (!isUndefOrEqual(Mask[i], i))
3978      return false;
3979
3980  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3981    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
3982      return false;
3983
3984  return true;
3985}
3986
3987/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
3988/// specifies a shuffle of elements that is suitable for input to INSERTPS.
3989/// i. e: If all but one element come from the same vector.
3990static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
3991  // TODO: Deal with AVX's VINSERTPS
3992  if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
3993    return false;
3994
3995  unsigned CorrectPosV1 = 0;
3996  unsigned CorrectPosV2 = 0;
3997  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
3998    if (Mask[i] == -1) {
3999      ++CorrectPosV1;
4000      ++CorrectPosV2;
4001      continue;
4002    }
4003
4004    if (Mask[i] == i)
4005      ++CorrectPosV1;
4006    else if (Mask[i] == i + 4)
4007      ++CorrectPosV2;
4008  }
4009
4010  if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4011    // We have 3 elements (undefs count as elements from any vector) from one
4012    // vector, and one from another.
4013    return true;
4014
4015  return false;
4016}
4017
4018//
4019// Some special combinations that can be optimized.
4020//
4021static
4022SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4023                               SelectionDAG &DAG) {
4024  MVT VT = SVOp->getSimpleValueType(0);
4025  SDLoc dl(SVOp);
4026
4027  if (VT != MVT::v8i32 && VT != MVT::v8f32)
4028    return SDValue();
4029
4030  ArrayRef<int> Mask = SVOp->getMask();
4031
4032  // These are the special masks that may be optimized.
4033  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4034  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
4035  bool MatchEvenMask = true;
4036  bool MatchOddMask  = true;
4037  for (int i=0; i<8; ++i) {
4038    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4039      MatchEvenMask = false;
4040    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4041      MatchOddMask = false;
4042  }
4043
4044  if (!MatchEvenMask && !MatchOddMask)
4045    return SDValue();
4046
4047  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4048
4049  SDValue Op0 = SVOp->getOperand(0);
4050  SDValue Op1 = SVOp->getOperand(1);
4051
4052  if (MatchEvenMask) {
4053    // Shift the second operand right to 32 bits.
4054    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4055    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4056  } else {
4057    // Shift the first operand left to 32 bits.
4058    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4059    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4060  }
4061  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4062  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4063}
4064
4065/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4066/// specifies a shuffle of elements that is suitable for input to UNPCKL.
4067static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4068                         bool HasInt256, bool V2IsSplat = false) {
4069
4070  assert(VT.getSizeInBits() >= 128 &&
4071         "Unsupported vector type for unpckl");
4072
4073  // AVX defines UNPCK* to operate independently on 128-bit lanes.
4074  unsigned NumLanes;
4075  unsigned NumOf256BitLanes;
4076  unsigned NumElts = VT.getVectorNumElements();
4077  if (VT.is256BitVector()) {
4078    if (NumElts != 4 && NumElts != 8 &&
4079        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4080    return false;
4081    NumLanes = 2;
4082    NumOf256BitLanes = 1;
4083  } else if (VT.is512BitVector()) {
4084    assert(VT.getScalarType().getSizeInBits() >= 32 &&
4085           "Unsupported vector type for unpckh");
4086    NumLanes = 2;
4087    NumOf256BitLanes = 2;
4088  } else {
4089    NumLanes = 1;
4090    NumOf256BitLanes = 1;
4091  }
4092
4093  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
4094  unsigned NumLaneElts = NumEltsInStride/NumLanes;
4095
4096  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
4097    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
4098      for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4099        int BitI  = Mask[l256*NumEltsInStride+l+i];
4100        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
4101        if (!isUndefOrEqual(BitI, j+l256*NumElts))
4102          return false;
4103        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
4104          return false;
4105        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
4106          return false;
4107      }
4108    }
4109  }
4110  return true;
4111}
4112
4113/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4114/// specifies a shuffle of elements that is suitable for input to UNPCKH.
4115static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4116                         bool HasInt256, bool V2IsSplat = false) {
4117  assert(VT.getSizeInBits() >= 128 &&
4118         "Unsupported vector type for unpckh");
4119
4120  // AVX defines UNPCK* to operate independently on 128-bit lanes.
4121  unsigned NumLanes;
4122  unsigned NumOf256BitLanes;
4123  unsigned NumElts = VT.getVectorNumElements();
4124  if (VT.is256BitVector()) {
4125    if (NumElts != 4 && NumElts != 8 &&
4126        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4127    return false;
4128    NumLanes = 2;
4129    NumOf256BitLanes = 1;
4130  } else if (VT.is512BitVector()) {
4131    assert(VT.getScalarType().getSizeInBits() >= 32 &&
4132           "Unsupported vector type for unpckh");
4133    NumLanes = 2;
4134    NumOf256BitLanes = 2;
4135  } else {
4136    NumLanes = 1;
4137    NumOf256BitLanes = 1;
4138  }
4139
4140  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
4141  unsigned NumLaneElts = NumEltsInStride/NumLanes;
4142
4143  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
4144    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
4145      for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4146        int BitI  = Mask[l256*NumEltsInStride+l+i];
4147        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
4148        if (!isUndefOrEqual(BitI, j+l256*NumElts))
4149          return false;
4150        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
4151          return false;
4152        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
4153          return false;
4154      }
4155    }
4156  }
4157  return true;
4158}
4159
4160/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4161/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4162/// <0, 0, 1, 1>
4163static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4164  unsigned NumElts = VT.getVectorNumElements();
4165  bool Is256BitVec = VT.is256BitVector();
4166
4167  if (VT.is512BitVector())
4168    return false;
4169  assert((VT.is128BitVector() || VT.is256BitVector()) &&
4170         "Unsupported vector type for unpckh");
4171
4172  if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4173      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4174    return false;
4175
4176  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4177  // FIXME: Need a better way to get rid of this, there's no latency difference
4178  // between UNPCKLPD and MOVDDUP, the later should always be checked first and
4179  // the former later. We should also remove the "_undef" special mask.
4180  if (NumElts == 4 && Is256BitVec)
4181    return false;
4182
4183  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4184  // independently on 128-bit lanes.
4185  unsigned NumLanes = VT.getSizeInBits()/128;
4186  unsigned NumLaneElts = NumElts/NumLanes;
4187
4188  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4189    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4190      int BitI  = Mask[l+i];
4191      int BitI1 = Mask[l+i+1];
4192
4193      if (!isUndefOrEqual(BitI, j))
4194        return false;
4195      if (!isUndefOrEqual(BitI1, j))
4196        return false;
4197    }
4198  }
4199
4200  return true;
4201}
4202
4203/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4204/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4205/// <2, 2, 3, 3>
4206static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4207  unsigned NumElts = VT.getVectorNumElements();
4208
4209  if (VT.is512BitVector())
4210    return false;
4211
4212  assert((VT.is128BitVector() || VT.is256BitVector()) &&
4213         "Unsupported vector type for unpckh");
4214
4215  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4216      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4217    return false;
4218
4219  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4220  // independently on 128-bit lanes.
4221  unsigned NumLanes = VT.getSizeInBits()/128;
4222  unsigned NumLaneElts = NumElts/NumLanes;
4223
4224  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4225    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4226      int BitI  = Mask[l+i];
4227      int BitI1 = Mask[l+i+1];
4228      if (!isUndefOrEqual(BitI, j))
4229        return false;
4230      if (!isUndefOrEqual(BitI1, j))
4231        return false;
4232    }
4233  }
4234  return true;
4235}
4236
4237// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4238// (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4239static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4240  if (!VT.is512BitVector())
4241    return false;
4242
4243  unsigned NumElts = VT.getVectorNumElements();
4244  unsigned HalfSize = NumElts/2;
4245  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4246    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4247      *Imm = 1;
4248      return true;
4249    }
4250  }
4251  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4252    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4253      *Imm = 0;
4254      return true;
4255    }
4256  }
4257  return false;
4258}
4259
4260/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4261/// specifies a shuffle of elements that is suitable for input to MOVSS,
4262/// MOVSD, and MOVD, i.e. setting the lowest element.
4263static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4264  if (VT.getVectorElementType().getSizeInBits() < 32)
4265    return false;
4266  if (!VT.is128BitVector())
4267    return false;
4268
4269  unsigned NumElts = VT.getVectorNumElements();
4270
4271  if (!isUndefOrEqual(Mask[0], NumElts))
4272    return false;
4273
4274  for (unsigned i = 1; i != NumElts; ++i)
4275    if (!isUndefOrEqual(Mask[i], i))
4276      return false;
4277
4278  return true;
4279}
4280
4281/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4282/// as permutations between 128-bit chunks or halves. As an example: this
4283/// shuffle bellow:
4284///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4285/// The first half comes from the second half of V1 and the second half from the
4286/// the second half of V2.
4287static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4288  if (!HasFp256 || !VT.is256BitVector())
4289    return false;
4290
4291  // The shuffle result is divided into half A and half B. In total the two
4292  // sources have 4 halves, namely: C, D, E, F. The final values of A and
4293  // B must come from C, D, E or F.
4294  unsigned HalfSize = VT.getVectorNumElements()/2;
4295  bool MatchA = false, MatchB = false;
4296
4297  // Check if A comes from one of C, D, E, F.
4298  for (unsigned Half = 0; Half != 4; ++Half) {
4299    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4300      MatchA = true;
4301      break;
4302    }
4303  }
4304
4305  // Check if B comes from one of C, D, E, F.
4306  for (unsigned Half = 0; Half != 4; ++Half) {
4307    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4308      MatchB = true;
4309      break;
4310    }
4311  }
4312
4313  return MatchA && MatchB;
4314}
4315
4316/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4317/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4318static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4319  MVT VT = SVOp->getSimpleValueType(0);
4320
4321  unsigned HalfSize = VT.getVectorNumElements()/2;
4322
4323  unsigned FstHalf = 0, SndHalf = 0;
4324  for (unsigned i = 0; i < HalfSize; ++i) {
4325    if (SVOp->getMaskElt(i) > 0) {
4326      FstHalf = SVOp->getMaskElt(i)/HalfSize;
4327      break;
4328    }
4329  }
4330  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4331    if (SVOp->getMaskElt(i) > 0) {
4332      SndHalf = SVOp->getMaskElt(i)/HalfSize;
4333      break;
4334    }
4335  }
4336
4337  return (FstHalf | (SndHalf << 4));
4338}
4339
4340// Symetric in-lane mask. Each lane has 4 elements (for imm8)
4341static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4342  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4343  if (EltSize < 32)
4344    return false;
4345
4346  unsigned NumElts = VT.getVectorNumElements();
4347  Imm8 = 0;
4348  if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4349    for (unsigned i = 0; i != NumElts; ++i) {
4350      if (Mask[i] < 0)
4351        continue;
4352      Imm8 |= Mask[i] << (i*2);
4353    }
4354    return true;
4355  }
4356
4357  unsigned LaneSize = 4;
4358  SmallVector<int, 4> MaskVal(LaneSize, -1);
4359
4360  for (unsigned l = 0; l != NumElts; l += LaneSize) {
4361    for (unsigned i = 0; i != LaneSize; ++i) {
4362      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4363        return false;
4364      if (Mask[i+l] < 0)
4365        continue;
4366      if (MaskVal[i] < 0) {
4367        MaskVal[i] = Mask[i+l] - l;
4368        Imm8 |= MaskVal[i] << (i*2);
4369        continue;
4370      }
4371      if (Mask[i+l] != (signed)(MaskVal[i]+l))
4372        return false;
4373    }
4374  }
4375  return true;
4376}
4377
4378/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4379/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4380/// Note that VPERMIL mask matching is different depending whether theunderlying
4381/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4382/// to the same elements of the low, but to the higher half of the source.
4383/// In VPERMILPD the two lanes could be shuffled independently of each other
4384/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4385static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4386  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4387  if (VT.getSizeInBits() < 256 || EltSize < 32)
4388    return false;
4389  bool symetricMaskRequired = (EltSize == 32);
4390  unsigned NumElts = VT.getVectorNumElements();
4391
4392  unsigned NumLanes = VT.getSizeInBits()/128;
4393  unsigned LaneSize = NumElts/NumLanes;
4394  // 2 or 4 elements in one lane
4395
4396  SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4397  for (unsigned l = 0; l != NumElts; l += LaneSize) {
4398    for (unsigned i = 0; i != LaneSize; ++i) {
4399      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4400        return false;
4401      if (symetricMaskRequired) {
4402        if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4403          ExpectedMaskVal[i] = Mask[i+l] - l;
4404          continue;
4405        }
4406        if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4407          return false;
4408      }
4409    }
4410  }
4411  return true;
4412}
4413
4414/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
4415/// of what x86 movss want. X86 movs requires the lowest  element to be lowest
4416/// element of vector 2 and the other elements to come from vector 1 in order.
4417static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4418                               bool V2IsSplat = false, bool V2IsUndef = false) {
4419  if (!VT.is128BitVector())
4420    return false;
4421
4422  unsigned NumOps = VT.getVectorNumElements();
4423  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4424    return false;
4425
4426  if (!isUndefOrEqual(Mask[0], 0))
4427    return false;
4428
4429  for (unsigned i = 1; i != NumOps; ++i)
4430    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||