X86ISelLowering.cpp revision 946a3a9f22c967d5432eaab5fa464b91343477cd
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "x86-isel"
16#include "X86ISelLowering.h"
17#include "Utils/X86ShuffleDecode.h"
18#include "X86.h"
19#include "X86InstrBuilder.h"
20#include "X86TargetMachine.h"
21#include "X86TargetObjectFile.h"
22#include "llvm/ADT/SmallSet.h"
23#include "llvm/ADT/Statistic.h"
24#include "llvm/ADT/StringExtras.h"
25#include "llvm/ADT/VariadicFunction.h"
26#include "llvm/CallingConv.h"
27#include "llvm/CodeGen/IntrinsicLowering.h"
28#include "llvm/CodeGen/MachineFrameInfo.h"
29#include "llvm/CodeGen/MachineFunction.h"
30#include "llvm/CodeGen/MachineInstrBuilder.h"
31#include "llvm/CodeGen/MachineJumpTableInfo.h"
32#include "llvm/CodeGen/MachineModuleInfo.h"
33#include "llvm/CodeGen/MachineRegisterInfo.h"
34#include "llvm/Constants.h"
35#include "llvm/DerivedTypes.h"
36#include "llvm/Function.h"
37#include "llvm/GlobalAlias.h"
38#include "llvm/GlobalVariable.h"
39#include "llvm/Instructions.h"
40#include "llvm/Intrinsics.h"
41#include "llvm/LLVMContext.h"
42#include "llvm/MC/MCAsmInfo.h"
43#include "llvm/MC/MCContext.h"
44#include "llvm/MC/MCExpr.h"
45#include "llvm/MC/MCSymbol.h"
46#include "llvm/Support/CallSite.h"
47#include "llvm/Support/Debug.h"
48#include "llvm/Support/ErrorHandling.h"
49#include "llvm/Support/MathExtras.h"
50#include "llvm/Target/TargetOptions.h"
51#include <bitset>
52#include <cctype>
53using namespace llvm;
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57// Forward declarations.
58static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
59                       SDValue V2);
60
61/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
62/// sets things up to match to an AVX VEXTRACTF128 instruction or a
63/// simple subregister reference.  Idx is an index in the 128 bits we
64/// want.  It need not be aligned to a 128-bit boundary.  That makes
65/// lowering EXTRACT_VECTOR_ELT operations easier.
66static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
67                                   SelectionDAG &DAG, DebugLoc dl) {
68  EVT VT = Vec.getValueType();
69  assert(VT.is256BitVector() && "Unexpected vector size!");
70  EVT ElVT = VT.getVectorElementType();
71  unsigned Factor = VT.getSizeInBits()/128;
72  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
73                                  VT.getVectorNumElements()/Factor);
74
75  // Extract from UNDEF is UNDEF.
76  if (Vec.getOpcode() == ISD::UNDEF)
77    return DAG.getUNDEF(ResultVT);
78
79  // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
80  // that we can match to VEXTRACTF128.
81  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
82
83  // This is the index of the first element of the 128-bit chunk
84  // we want.
85  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
86                               * ElemsPerChunk);
87
88  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
89  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
90                               VecIdx);
91
92  return Result;
93}
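// Worked example of the index math above (illustrative numbers, not from the
// original source): for a v8i32 source (ElVT = i32, so ElemsPerChunk =
// 128/32 = 4) and IdxVal = 5, NormalizedIdxVal = ((5 * 32) / 128) * 4 = 4,
// so the EXTRACT_SUBVECTOR starts at element 4, i.e. the upper 128-bit half.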
94
95/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
96/// sets things up to match to an AVX VINSERTF128 instruction or a
97/// simple superregister reference.  Idx is an index in the 128 bits
98/// we want.  It need not be aligned to a 128-bit boundary.  That makes
99/// lowering INSERT_VECTOR_ELT operations easier.
100static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
101                                  unsigned IdxVal, SelectionDAG &DAG,
102                                  DebugLoc dl) {
103  // Inserting UNDEF leaves Result unchanged.
104  if (Vec.getOpcode() == ISD::UNDEF)
105    return Result;
106
107  EVT VT = Vec.getValueType();
108  assert(VT.is128BitVector() && "Unexpected vector size!");
109
110  EVT ElVT = VT.getVectorElementType();
111  EVT ResultVT = Result.getValueType();
112
113  // Insert the relevant 128 bits.
114  unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
115
116  // This is the index of the first element of the 128-bit chunk
117  // we want.
118  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
119                               * ElemsPerChunk);
120
121  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
122  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
123                     VecIdx);
124}
125
126/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
127/// instructions. This is used because creating CONCAT_VECTORS nodes of
128/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
129/// large BUILD_VECTORS.
130static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
131                                   unsigned NumElems, SelectionDAG &DAG,
132                                   DebugLoc dl) {
133  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
134  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
135}
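// Illustrative use (hypothetical values Lo/Hi, not taken from this file):
// concatenating two v4i32 halves into a v8i32 would be
//   SDValue Wide = Concat128BitVectors(Lo, Hi, MVT::v8i32, /*NumElems=*/8,
//                                      DAG, dl);
// which becomes an INSERT_SUBVECTOR at element 0 followed by one at element
// NumElems/2 == 4, matching a VINSERTF128 of the upper lane.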
136
137static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
138  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
139  bool is64Bit = Subtarget->is64Bit();
140
141  if (Subtarget->isTargetEnvMacho()) {
142    if (is64Bit)
143      return new X86_64MachoTargetObjectFile();
144    return new TargetLoweringObjectFileMachO();
145  }
146
147  if (Subtarget->isTargetLinux())
148    return new X86LinuxTargetObjectFile();
149  if (Subtarget->isTargetELF())
150    return new TargetLoweringObjectFileELF();
151  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
152    return new TargetLoweringObjectFileCOFF();
153  llvm_unreachable("unknown subtarget type");
154}
155
156X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
157  : TargetLowering(TM, createTLOF(TM)) {
158  Subtarget = &TM.getSubtarget<X86Subtarget>();
159  X86ScalarSSEf64 = Subtarget->hasSSE2();
160  X86ScalarSSEf32 = Subtarget->hasSSE1();
161
162  RegInfo = TM.getRegisterInfo();
163  TD = getDataLayout();
164
165  // Set up the TargetLowering object.
166  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
167
168  // X86 is weird; it always uses i8 for shift amounts and setcc results.
169  setBooleanContents(ZeroOrOneBooleanContent);
170  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
171  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
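  // For example, a scalar SETCC materializes 0 or 1 in an i8 register
  // (SETcc), while vector compares such as PCMPEQ/PCMPGT produce all-ones or
  // all-zero lanes, which is why the two boolean contents differ.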
172
173  // For 64-bit, since we have so many registers, use the ILP scheduler; for
174  // 32-bit code, use the register-pressure-specific scheduling.
175  // For Atom, always use ILP scheduling.
176  if (Subtarget->isAtom())
177    setSchedulingPreference(Sched::ILP);
178  else if (Subtarget->is64Bit())
179    setSchedulingPreference(Sched::ILP);
180  else
181    setSchedulingPreference(Sched::RegPressure);
182  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
183
184  // Bypass i32 with i8 on Atom when compiling with O2
185  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
186    addBypassSlowDiv(32, 8);
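  // Sketch of the effect (assumed, based on the BypassSlowDivision
  // transform): a 32-bit divide is guarded by a run-time check, and when both
  // operands fit in 8 bits the much cheaper 8-bit DIV is used instead.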
187
188  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
189    // Set up Windows compiler runtime calls.
190    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
191    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
192    setLibcallName(RTLIB::SREM_I64, "_allrem");
193    setLibcallName(RTLIB::UREM_I64, "_aullrem");
194    setLibcallName(RTLIB::MUL_I64, "_allmul");
195    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
196    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
197    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
198    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
199    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
200
201    // The _ftol2 runtime function has an unusual calling conv, which
202    // is modeled by a special pseudo-instruction.
203    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
204    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
205    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
206    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
207  }
208
209  if (Subtarget->isTargetDarwin()) {
210    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
211    setUseUnderscoreSetJmp(false);
212    setUseUnderscoreLongJmp(false);
213  } else if (Subtarget->isTargetMingw()) {
214    // MS runtime is weird: it exports _setjmp, but longjmp!
215    setUseUnderscoreSetJmp(true);
216    setUseUnderscoreLongJmp(false);
217  } else {
218    setUseUnderscoreSetJmp(true);
219    setUseUnderscoreLongJmp(true);
220  }
221
222  // Set up the register classes.
223  addRegisterClass(MVT::i8, &X86::GR8RegClass);
224  addRegisterClass(MVT::i16, &X86::GR16RegClass);
225  addRegisterClass(MVT::i32, &X86::GR32RegClass);
226  if (Subtarget->is64Bit())
227    addRegisterClass(MVT::i64, &X86::GR64RegClass);
228
229  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
230
231  // We don't accept any truncstore of integer registers.
232  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
233  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
234  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
235  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
236  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
237  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
238
239  // SETOEQ and SETUNE require checking two conditions.
240  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
241  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
242  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
243  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
244  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
245  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
246
247  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
248  // operation.
249  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
250  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
251  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
252
253  if (Subtarget->is64Bit()) {
254    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
255    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
256  } else if (!TM.Options.UseSoftFloat) {
257    // We have an algorithm for SSE2->double, and we turn this into a
258    // 64-bit FILD followed by conditional FADD for other targets.
259    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
260    // We have an algorithm for SSE2, and we turn this into a 64-bit
261    // FILD for other targets.
262    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
263  }
264
265  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
266  // this operation.
267  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
268  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
269
270  if (!TM.Options.UseSoftFloat) {
271    // SSE has no i16 to fp conversion, only i32
272    if (X86ScalarSSEf32) {
273      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
274      // f32 and f64 cases are Legal, f80 case is not
275      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
276    } else {
277      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
278      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
279    }
280  } else {
281    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
282    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
283  }
284
285  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
286  // are Legal, f80 is custom lowered.
287  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
288  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
289
290  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
291  // this operation.
292  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
293  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
294
295  if (X86ScalarSSEf32) {
296    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
297    // f32 and f64 cases are Legal, f80 case is not
298    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
299  } else {
300    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
301    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
302  }
303
304  // Handle FP_TO_UINT by promoting the destination to a larger signed
305  // conversion.
306  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
307  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
308  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
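  // For example (per the comment above), an f64 -> i16 unsigned conversion is
  // performed as a wider signed f64 -> i32 conversion whose low 16 bits give
  // the result, since the signed i32 range covers every unsigned i16 value.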
309
310  if (Subtarget->is64Bit()) {
311    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
312    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
313  } else if (!TM.Options.UseSoftFloat) {
314    // Since AVX is a superset of SSE3, only check for SSE here.
315    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
316      // Expand FP_TO_UINT into a select.
317      // FIXME: We would like to use a Custom expander here eventually to do
318      // the optimal thing for SSE vs. the default expansion in the legalizer.
319      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
320    else
321      // With SSE3 we can use fisttpll to convert to a signed i64; without
322      // SSE, we're stuck with a fistpll.
323      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
324  }
325
326  if (isTargetFTOL()) {
327    // Use the _ftol2 runtime function, which has a pseudo-instruction
328    // to handle its weird calling convention.
329    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
330  }
331
332  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
333  if (!X86ScalarSSEf64) {
334    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
335    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
336    if (Subtarget->is64Bit()) {
337      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
338      // Without SSE, i64->f64 goes through memory.
339      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
340    }
341  }
342
343  // Scalar integer divide and remainder are lowered to use operations that
344  // produce two results, to match the available instructions. This exposes
345  // the two-result form to trivial CSE, which is able to combine x/y and x%y
346  // into a single instruction.
347  //
348  // Scalar integer multiply-high is also lowered to use two-result
349  // operations, to match the available instructions. However, plain multiply
350  // (low) operations are left as Legal, as there are single-result
351  // instructions for this in x86. Using the two-result multiply instructions
352  // when both high and low results are needed must be arranged by dagcombine.
353  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
354    MVT VT = IntVTs[i];
355    setOperationAction(ISD::MULHS, VT, Expand);
356    setOperationAction(ISD::MULHU, VT, Expand);
357    setOperationAction(ISD::SDIV, VT, Expand);
358    setOperationAction(ISD::UDIV, VT, Expand);
359    setOperationAction(ISD::SREM, VT, Expand);
360    setOperationAction(ISD::UREM, VT, Expand);
361
362    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
363    setOperationAction(ISD::ADDC, VT, Custom);
364    setOperationAction(ISD::ADDE, VT, Custom);
365    setOperationAction(ISD::SUBC, VT, Custom);
366    setOperationAction(ISD::SUBE, VT, Custom);
367  }
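  // Illustrative consequence: IR that computes both "sdiv i32 %a, %b" and
  // "srem i32 %a, %b" ends up as a single ISD::SDIVREM node, which maps to
  // one IDIV producing the quotient in EAX and the remainder in EDX.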
368
369  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
370  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
371  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
372  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
373  if (Subtarget->is64Bit())
374    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
375  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
376  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
377  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
378  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
379  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
380  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
381  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
382  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
383
384  // Promote the i8 variants and force them up to i32, which has a shorter
385  // encoding.
386  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
387  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
388  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
389  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
390  if (Subtarget->hasBMI()) {
391    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
392    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
393    if (Subtarget->is64Bit())
394      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
395  } else {
396    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
397    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
398    if (Subtarget->is64Bit())
399      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
400  }
401
402  if (Subtarget->hasLZCNT()) {
403    // When promoting the i8 variants, force them to i32 for a shorter
404    // encoding.
405    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
406    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
407    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
408    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
409    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
410    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
411    if (Subtarget->is64Bit())
412      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
413  } else {
414    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
415    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
416    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
417    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
418    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
419    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
420    if (Subtarget->is64Bit()) {
421      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
422      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
423    }
424  }
425
426  if (Subtarget->hasPOPCNT()) {
427    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
428  } else {
429    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
430    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
431    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
432    if (Subtarget->is64Bit())
433      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
434  }
435
436  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
437  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
438
439  // These should be promoted to a larger select which is supported.
440  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
441  // X86 wants to expand cmov itself.
442  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
443  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
444  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
445  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
446  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
447  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
448  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
449  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
450  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
451  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
452  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
453  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
454  if (Subtarget->is64Bit()) {
455    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
456    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
457  }
458  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
459  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
460  // SjLj exception handling but is a lightweight setjmp/longjmp replacement
461  // to support continuations, user-level threading, etc. As a result, no
462  // other SjLj exception interfaces are implemented; please don't build
463  // your own exception handling based on them.
464  // LLVM/Clang supports zero-cost DWARF exception handling.
465  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
466  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
467
468  // Darwin ABI issue.
469  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
470  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
471  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
472  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
473  if (Subtarget->is64Bit())
474    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
475  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
476  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
477  if (Subtarget->is64Bit()) {
478    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
479    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
480    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
481    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
482    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
483  }
484  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
485  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
486  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
487  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
488  if (Subtarget->is64Bit()) {
489    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
490    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
491    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
492  }
493
494  if (Subtarget->hasSSE1())
495    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
496
497  setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
498  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
499
500  // On X86 and X86-64, atomic operations are lowered to locked instructions.
501  // Locked instructions, in turn, have implicit fence semantics (all memory
502  // operations are flushed before issuing the locked instruction, and they
503  // are not buffered), so we can fold away the common pattern of
504  // fence-atomic-fence.
505  setShouldFoldAtomicFences(true);
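  // Illustrative consequence: in a "fence; atomicrmw add; fence" sequence the
  // surrounding fences can be dropped, because the LOCK-prefixed RMW
  // instruction already provides the required ordering.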
506
507  // Expand certain atomics
508  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
509    MVT VT = IntVTs[i];
510    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
511    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
512    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
513  }
514
515  if (!Subtarget->is64Bit()) {
516    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
517    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
518    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
519    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
520    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
521    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
522    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
523    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
524    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
525    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
526    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
527    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
528  }
529
530  if (Subtarget->hasCmpxchg16b()) {
531    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
532  }
533
534  // FIXME - use subtarget debug flags
535  if (!Subtarget->isTargetDarwin() &&
536      !Subtarget->isTargetELF() &&
537      !Subtarget->isTargetCygMing()) {
538    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
539  }
540
541  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
542  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
543  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
544  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
545  if (Subtarget->is64Bit()) {
546    setExceptionPointerRegister(X86::RAX);
547    setExceptionSelectorRegister(X86::RDX);
548  } else {
549    setExceptionPointerRegister(X86::EAX);
550    setExceptionSelectorRegister(X86::EDX);
551  }
552  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
553  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
554
555  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
556  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
557
558  setOperationAction(ISD::TRAP, MVT::Other, Legal);
559  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
560
561  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
562  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
563  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
564  if (Subtarget->is64Bit()) {
565    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
566    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
567  } else {
568    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
569    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
570  }
571
572  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
573  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
574
575  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
576    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
577                       MVT::i64 : MVT::i32, Custom);
578  else if (TM.Options.EnableSegmentedStacks)
579    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
580                       MVT::i64 : MVT::i32, Custom);
581  else
582    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
583                       MVT::i64 : MVT::i32, Expand);
584
585  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
586    // f32 and f64 use SSE.
587    // Set up the FP register classes.
588    addRegisterClass(MVT::f32, &X86::FR32RegClass);
589    addRegisterClass(MVT::f64, &X86::FR64RegClass);
590
591    // Use ANDPD to simulate FABS.
592    setOperationAction(ISD::FABS , MVT::f64, Custom);
593    setOperationAction(ISD::FABS , MVT::f32, Custom);
594
595    // Use XORP to simulate FNEG.
596    setOperationAction(ISD::FNEG , MVT::f64, Custom);
597    setOperationAction(ISD::FNEG , MVT::f32, Custom);
598
599    // Use ANDPD and ORPD to simulate FCOPYSIGN.
600    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
601    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
602
603    // Lower this to FGETSIGNx86 plus an AND.
604    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
605    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
606
607    // We don't support sin/cos/fmod
608    setOperationAction(ISD::FSIN , MVT::f64, Expand);
609    setOperationAction(ISD::FCOS , MVT::f64, Expand);
610    setOperationAction(ISD::FSIN , MVT::f32, Expand);
611    setOperationAction(ISD::FCOS , MVT::f32, Expand);
612
613    // Expand FP immediates into loads from the stack, except for the special
614    // cases we handle.
615    addLegalFPImmediate(APFloat(+0.0)); // xorpd
616    addLegalFPImmediate(APFloat(+0.0f)); // xorps
617  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
618    // Use SSE for f32, x87 for f64.
619    // Set up the FP register classes.
620    addRegisterClass(MVT::f32, &X86::FR32RegClass);
621    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
622
623    // Use ANDPS to simulate FABS.
624    setOperationAction(ISD::FABS , MVT::f32, Custom);
625
626    // Use XORP to simulate FNEG.
627    setOperationAction(ISD::FNEG , MVT::f32, Custom);
628
629    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
630
631    // Use ANDPS and ORPS to simulate FCOPYSIGN.
632    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
633    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
634
635    // We don't support sin/cos/fmod
636    setOperationAction(ISD::FSIN , MVT::f32, Expand);
637    setOperationAction(ISD::FCOS , MVT::f32, Expand);
638
639    // Special cases we handle for FP constants.
640    addLegalFPImmediate(APFloat(+0.0f)); // xorps
641    addLegalFPImmediate(APFloat(+0.0)); // FLD0
642    addLegalFPImmediate(APFloat(+1.0)); // FLD1
643    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
644    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
645
646    if (!TM.Options.UnsafeFPMath) {
647      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
648      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
649    }
650  } else if (!TM.Options.UseSoftFloat) {
651    // f32 and f64 in x87.
652    // Set up the FP register classes.
653    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
654    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
655
656    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
657    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
658    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
659    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
660
661    if (!TM.Options.UnsafeFPMath) {
662      setOperationAction(ISD::FSIN           , MVT::f32  , Expand);
663      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
664      setOperationAction(ISD::FCOS           , MVT::f32  , Expand);
665      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
666    }
667    addLegalFPImmediate(APFloat(+0.0)); // FLD0
668    addLegalFPImmediate(APFloat(+1.0)); // FLD1
669    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
670    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
671    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
672    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
673    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
674    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
675  }
676
677  // We don't support FMA.
678  setOperationAction(ISD::FMA, MVT::f64, Expand);
679  setOperationAction(ISD::FMA, MVT::f32, Expand);
680
681  // Long double always uses X87.
682  if (!TM.Options.UseSoftFloat) {
683    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
684    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
685    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
686    {
687      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
688      addLegalFPImmediate(TmpFlt);  // FLD0
689      TmpFlt.changeSign();
690      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
691
692      bool ignored;
693      APFloat TmpFlt2(+1.0);
694      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
695                      &ignored);
696      addLegalFPImmediate(TmpFlt2);  // FLD1
697      TmpFlt2.changeSign();
698      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
699    }
700
701    if (!TM.Options.UnsafeFPMath) {
702      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
703      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
704    }
705
706    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
707    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
708    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
709    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
710    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
711    setOperationAction(ISD::FMA, MVT::f80, Expand);
712  }
713
714  // Always use a library call for pow.
715  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
716  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
717  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
718
719  setOperationAction(ISD::FLOG, MVT::f80, Expand);
720  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
721  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
722  setOperationAction(ISD::FEXP, MVT::f80, Expand);
723  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
724
725  // First set operation action for all vector types to either promote
726  // (for widening) or expand (for scalarization). Then we will selectively
727  // turn on ones that can be effectively codegen'd.
728  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
729           i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
730    MVT VT = (MVT::SimpleValueType)i;
731    setOperationAction(ISD::ADD , VT, Expand);
732    setOperationAction(ISD::SUB , VT, Expand);
733    setOperationAction(ISD::FADD, VT, Expand);
734    setOperationAction(ISD::FNEG, VT, Expand);
735    setOperationAction(ISD::FSUB, VT, Expand);
736    setOperationAction(ISD::MUL , VT, Expand);
737    setOperationAction(ISD::FMUL, VT, Expand);
738    setOperationAction(ISD::SDIV, VT, Expand);
739    setOperationAction(ISD::UDIV, VT, Expand);
740    setOperationAction(ISD::FDIV, VT, Expand);
741    setOperationAction(ISD::SREM, VT, Expand);
742    setOperationAction(ISD::UREM, VT, Expand);
743    setOperationAction(ISD::LOAD, VT, Expand);
744    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
745    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
746    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
747    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
748    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
749    setOperationAction(ISD::FABS, VT, Expand);
750    setOperationAction(ISD::FSIN, VT, Expand);
751    setOperationAction(ISD::FCOS, VT, Expand);
752    setOperationAction(ISD::FREM, VT, Expand);
753    setOperationAction(ISD::FMA,  VT, Expand);
754    setOperationAction(ISD::FPOWI, VT, Expand);
755    setOperationAction(ISD::FSQRT, VT, Expand);
756    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
757    setOperationAction(ISD::FFLOOR, VT, Expand);
758    setOperationAction(ISD::FCEIL, VT, Expand);
759    setOperationAction(ISD::FTRUNC, VT, Expand);
760    setOperationAction(ISD::FRINT, VT, Expand);
761    setOperationAction(ISD::FNEARBYINT, VT, Expand);
762    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
763    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
764    setOperationAction(ISD::SDIVREM, VT, Expand);
765    setOperationAction(ISD::UDIVREM, VT, Expand);
766    setOperationAction(ISD::FPOW, VT, Expand);
767    setOperationAction(ISD::CTPOP, VT, Expand);
768    setOperationAction(ISD::CTTZ, VT, Expand);
769    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
770    setOperationAction(ISD::CTLZ, VT, Expand);
771    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
772    setOperationAction(ISD::SHL, VT, Expand);
773    setOperationAction(ISD::SRA, VT, Expand);
774    setOperationAction(ISD::SRL, VT, Expand);
775    setOperationAction(ISD::ROTL, VT, Expand);
776    setOperationAction(ISD::ROTR, VT, Expand);
777    setOperationAction(ISD::BSWAP, VT, Expand);
778    setOperationAction(ISD::SETCC, VT, Expand);
779    setOperationAction(ISD::FLOG, VT, Expand);
780    setOperationAction(ISD::FLOG2, VT, Expand);
781    setOperationAction(ISD::FLOG10, VT, Expand);
782    setOperationAction(ISD::FEXP, VT, Expand);
783    setOperationAction(ISD::FEXP2, VT, Expand);
784    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
785    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
786    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
787    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
788    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
789    setOperationAction(ISD::TRUNCATE, VT, Expand);
790    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
791    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
792    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
793    setOperationAction(ISD::VSELECT, VT, Expand);
794    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
795             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
796      setTruncStoreAction(VT,
797                          (MVT::SimpleValueType)InnerVT, Expand);
798    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
799    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
800    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
801  }
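  // Everything vector-typed now defaults to Expand; the feature-specific
  // blocks below (SSE1, SSE2, SSE4.1, AVX/AVX2) selectively re-enable
  // operations. For example, FADD on v4f32 is flipped back to Legal in the
  // SSE1 block that follows.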
802
803  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
804  // with -msoft-float, disable use of MMX as well.
805  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
806    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
807    // No operations on x86mmx supported, everything uses intrinsics.
808  }
809
810  // MMX-sized vectors (other than x86mmx) are expected to be expanded
811  // into smaller operations.
812  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
813  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
814  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
815  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
816  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
817  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
818  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
819  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
820  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
821  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
822  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
823  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
824  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
825  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
826  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
827  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
828  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
829  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
830  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
831  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
832  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
833  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
834  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
835  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
836  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
837  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
838  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
839  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
840  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
841
842  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
843    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
844
845    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
846    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
847    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
848    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
849    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
850    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
851    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
852    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
853    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
854    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
855    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
856    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
857  }
858
859  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
860    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
861
862    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
863    // registers cannot be used even for integer operations.
864    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
865    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
866    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
867    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
868
869    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
870    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
871    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
872    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
873    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
874    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
875    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
876    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
877    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
878    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
879    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
880    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
881    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
882    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
883    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
884    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
885    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
886
887    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
888    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
889    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
890    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
891
892    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
893    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
894    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
895    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
896    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
897
898    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
899    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
900      MVT VT = (MVT::SimpleValueType)i;
901      // Do not attempt to custom lower non-power-of-2 vectors
902      if (!isPowerOf2_32(VT.getVectorNumElements()))
903        continue;
904      // Do not attempt to custom lower non-128-bit vectors
905      if (!VT.is128BitVector())
906        continue;
907      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
908      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
909      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
910    }
911
912    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
913    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
914    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
915    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
916    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
917    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
918
919    if (Subtarget->is64Bit()) {
920      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
921      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
922    }
923
924    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
925    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
926      MVT VT = (MVT::SimpleValueType)i;
927
928      // Do not attempt to promote non-128-bit vectors
929      if (!VT.is128BitVector())
930        continue;
931
932      setOperationAction(ISD::AND,    VT, Promote);
933      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
934      setOperationAction(ISD::OR,     VT, Promote);
935      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
936      setOperationAction(ISD::XOR,    VT, Promote);
937      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
938      setOperationAction(ISD::LOAD,   VT, Promote);
939      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
940      setOperationAction(ISD::SELECT, VT, Promote);
941      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
942    }
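    // For example, with the promotions above a v4i32 AND is bitcast to
    // v2i64, performed with a single PAND, and the result is bitcast back to
    // v4i32.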
943
944    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
945
946    // Custom lower v2i64 and v2f64 selects.
947    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
948    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
949    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
950    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
951
952    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
953    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
954
955    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
956    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
957    // As there is no 64-bit GPR available, we need to build a special custom
958    // sequence to convert from v2i32 to v2f32.
959    if (!Subtarget->is64Bit())
960      setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
961
962    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
963    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
964
965    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
966  }
967
968  if (Subtarget->hasSSE41()) {
969    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
970    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
971    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
972    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
973    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
974    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
975    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
976    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
977    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
978    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
979
980    setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
981    setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
982    setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
983    setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
984    setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
985    setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
986    setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
987    setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
988    setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
989    setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
990
991    // FIXME: Do we need to handle scalar-to-vector here?
992    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
993
994    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
995    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
996    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
997    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
998    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
999
1000    // i8 and i16 vectors are custom, because the source register and source
1001    // memory operand types are not the same width.  f32 vectors are
1002    // custom since the immediate controlling the insert encodes additional
1003    // information.
1004    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1005    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1006    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1007    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1008
1009    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1010    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1011    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1012    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1013
1014    // FIXME: these should be Legal, but that's only for the case where
1015    // the index is constant.  For now, custom expand to deal with that.
1016    if (Subtarget->is64Bit()) {
1017      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1018      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1019    }
1020  }
1021
1022  if (Subtarget->hasSSE2()) {
1023    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1024    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1025
1026    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1027    setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1028
1029    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1030    setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1031
1032    if (Subtarget->hasInt256()) {
1033      setOperationAction(ISD::SRL,             MVT::v2i64, Legal);
1034      setOperationAction(ISD::SRL,             MVT::v4i32, Legal);
1035
1036      setOperationAction(ISD::SHL,             MVT::v2i64, Legal);
1037      setOperationAction(ISD::SHL,             MVT::v4i32, Legal);
1038
1039      setOperationAction(ISD::SRA,             MVT::v4i32, Legal);
1040    } else {
1041      setOperationAction(ISD::SRL,             MVT::v2i64, Custom);
1042      setOperationAction(ISD::SRL,             MVT::v4i32, Custom);
1043
1044      setOperationAction(ISD::SHL,             MVT::v2i64, Custom);
1045      setOperationAction(ISD::SHL,             MVT::v4i32, Custom);
1046
1047      setOperationAction(ISD::SRA,             MVT::v4i32, Custom);
1048    }
1049  }
1050
1051  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1052    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1053    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1054    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1055    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1056    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1057    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1058
1059    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1060    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1061    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1062
1063    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1064    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1065    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1066    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1067    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1068    setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1069    setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1070    setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1071    setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1072    setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1073    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1074    setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1075
1076    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1077    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1078    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1079    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1080    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1081    setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1082    setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1083    setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1084    setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1085    setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1086    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1087    setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1088
1089    setOperationAction(ISD::TRUNCATE,           MVT::v8i16, Custom);
1090
1091    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
1092
1093    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1094    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1095    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1096
1097    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i32, Custom);
1098    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1099    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1100
1101    setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
1102
1103    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1104    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1105
1106    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1107    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1108
1109    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1110    setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1111
1112    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1113    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1114    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1115    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1116
1117    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1118    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1119    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1120
1121    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
1122    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
1123    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
1124    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
1125
1126    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1127      setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1128      setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1129      setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1130      setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1131      setOperationAction(ISD::FMA,             MVT::f32, Legal);
1132      setOperationAction(ISD::FMA,             MVT::f64, Legal);
1133    }
1134
1135    if (Subtarget->hasInt256()) {
1136      setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1137      setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1138      setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1139      setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1140
1141      setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1142      setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1143      setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1144      setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1145
1146      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1147      setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1148      setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1149      // Don't lower v32i8 because there is no 128-bit byte mul
1150
1151      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1152
1153      setOperationAction(ISD::SRL,             MVT::v4i64, Legal);
1154      setOperationAction(ISD::SRL,             MVT::v8i32, Legal);
1155
1156      setOperationAction(ISD::SHL,             MVT::v4i64, Legal);
1157      setOperationAction(ISD::SHL,             MVT::v8i32, Legal);
1158
1159      setOperationAction(ISD::SRA,             MVT::v8i32, Legal);
1160    } else {
1161      setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1162      setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1163      setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1164      setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1165
1166      setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1167      setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1168      setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1169      setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1170
1171      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1172      setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1173      setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1174      // Don't lower v32i8 because there is no 128-bit byte mul
1175
1176      setOperationAction(ISD::SRL,             MVT::v4i64, Custom);
1177      setOperationAction(ISD::SRL,             MVT::v8i32, Custom);
1178
1179      setOperationAction(ISD::SHL,             MVT::v4i64, Custom);
1180      setOperationAction(ISD::SHL,             MVT::v8i32, Custom);
1181
1182      setOperationAction(ISD::SRA,             MVT::v8i32, Custom);
1183    }
1184
1185    // Custom lower several nodes for 256-bit types.
1186    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
1187             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
1188      MVT VT = (MVT::SimpleValueType)i;
1189
1190      // Extract subvector is special because the value type
1191      // (result) is 128-bit but the source is 256-bit wide.
1192      if (VT.is128BitVector())
1193        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1194
1195      // Do not attempt to custom lower other non-256-bit vectors
1196      if (!VT.is256BitVector())
1197        continue;
1198
1199      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1200      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1201      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1202      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1203      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1204      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1205      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1206    }
1207
1208  // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
1209    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1210      MVT VT = (MVT::SimpleValueType)i;
1211
1212      // Do not attempt to promote non-256-bit vectors
1213      if (!VT.is256BitVector())
1214        continue;
1215
1216      setOperationAction(ISD::AND,    VT, Promote);
1217      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1218      setOperationAction(ISD::OR,     VT, Promote);
1219      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1220      setOperationAction(ISD::XOR,    VT, Promote);
1221      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1222      setOperationAction(ISD::LOAD,   VT, Promote);
1223      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1224      setOperationAction(ISD::SELECT, VT, Promote);
1225      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1226    }
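    // For example, with the promotions registered above a bitwise AND of two
    // v8i32 values is bitcast to v4i64, performed as a single 256-bit v4i64
    // AND, and the result is bitcast back to v8i32.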
1227  }
1228
1229  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1230  // of these operations with custom code.
1231  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
1232           VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
1233    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
1234                       Custom);
1235  }
1236
1237  // We want to custom lower some of our intrinsics.
1238  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1239  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1240
1241
1242  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1243  // handle type legalization for these operations here.
1244  //
1245  // FIXME: We really should do custom legalization for addition and
1246  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1247  // than generic legalization for 64-bit multiplication-with-overflow, though.
1248  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1249    // Add/Sub/Mul with overflow operations are custom lowered.
1250    MVT VT = IntVTs[i];
1251    setOperationAction(ISD::SADDO, VT, Custom);
1252    setOperationAction(ISD::UADDO, VT, Custom);
1253    setOperationAction(ISD::SSUBO, VT, Custom);
1254    setOperationAction(ISD::USUBO, VT, Custom);
1255    setOperationAction(ISD::SMULO, VT, Custom);
1256    setOperationAction(ISD::UMULO, VT, Custom);
1257  }
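  // For example, a call to @llvm.sadd.with.overflow.i32 is typically lowered
  // to an X86 add that also produces EFLAGS, with the overflow bit read back
  // via a setcc on the overflow flag rather than a separate compare.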
1258
1259  // There are no 8-bit 3-address imul/mul instructions
1260  setOperationAction(ISD::SMULO, MVT::i8, Expand);
1261  setOperationAction(ISD::UMULO, MVT::i8, Expand);
1262
1263  if (!Subtarget->is64Bit()) {
1264    // These libcalls are not available in 32-bit.
1265    setLibcallName(RTLIB::SHL_I128, 0);
1266    setLibcallName(RTLIB::SRL_I128, 0);
1267    setLibcallName(RTLIB::SRA_I128, 0);
1268  }
1269
1270  // We have target-specific dag combine patterns for the following nodes:
1271  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1272  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1273  setTargetDAGCombine(ISD::VSELECT);
1274  setTargetDAGCombine(ISD::SELECT);
1275  setTargetDAGCombine(ISD::SHL);
1276  setTargetDAGCombine(ISD::SRA);
1277  setTargetDAGCombine(ISD::SRL);
1278  setTargetDAGCombine(ISD::OR);
1279  setTargetDAGCombine(ISD::AND);
1280  setTargetDAGCombine(ISD::ADD);
1281  setTargetDAGCombine(ISD::FADD);
1282  setTargetDAGCombine(ISD::FSUB);
1283  setTargetDAGCombine(ISD::FMA);
1284  setTargetDAGCombine(ISD::SUB);
1285  setTargetDAGCombine(ISD::LOAD);
1286  setTargetDAGCombine(ISD::STORE);
1287  setTargetDAGCombine(ISD::ZERO_EXTEND);
1288  setTargetDAGCombine(ISD::ANY_EXTEND);
1289  setTargetDAGCombine(ISD::SIGN_EXTEND);
1290  setTargetDAGCombine(ISD::TRUNCATE);
1291  setTargetDAGCombine(ISD::SINT_TO_FP);
1292  setTargetDAGCombine(ISD::SETCC);
1293  if (Subtarget->is64Bit())
1294    setTargetDAGCombine(ISD::MUL);
1295  setTargetDAGCombine(ISD::XOR);
1296
1297  computeRegisterProperties();
1298
1299  // On Darwin, -Os means optimize for size without hurting performance,
1300  // so do not reduce the limit.
1301  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1302  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1303  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1304  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1305  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1306  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1307  setPrefLoopAlignment(4); // 2^4 bytes.
1308  benefitFromCodePlacementOpt = true;
1309
1310  // Predictable cmovs don't hurt on Atom because it's in-order.
1311  predictableSelectIsExpensive = !Subtarget->isAtom();
1312
1313  setPrefFunctionAlignment(4); // 2^4 bytes.
1314}
1315
1316
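// For example, a scalar f64 comparison yields an i8 result, while a compare
// of <4 x float> vectors yields a <4 x i32> mask with one lane per element.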
1317EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
1318  if (!VT.isVector()) return MVT::i8;
1319  return VT.changeVectorElementTypeToInteger();
1320}
1321
1322
1323/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1324/// the desired ByVal argument alignment.
1325static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1326  if (MaxAlign == 16)
1327    return;
1328  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1329    if (VTy->getBitWidth() == 128)
1330      MaxAlign = 16;
1331  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1332    unsigned EltAlign = 0;
1333    getMaxByValAlign(ATy->getElementType(), EltAlign);
1334    if (EltAlign > MaxAlign)
1335      MaxAlign = EltAlign;
1336  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1337    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1338      unsigned EltAlign = 0;
1339      getMaxByValAlign(STy->getElementType(i), EltAlign);
1340      if (EltAlign > MaxAlign)
1341        MaxAlign = EltAlign;
1342      if (MaxAlign == 16)
1343        break;
1344    }
1345  }
1346}
1347
1348/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1349/// function arguments in the caller parameter area. For X86, aggregates
1350/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1351/// are at 4-byte boundaries.
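/// For example, on 32-bit x86 with SSE a byval struct containing a
/// <4 x float> member is aligned to 16 bytes, a struct of plain i32 fields
/// keeps the 4-byte default, and on x86-64 the result is always at least 8.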
1352unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1353  if (Subtarget->is64Bit()) {
1354    // Max of 8 and alignment of type.
1355    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1356    if (TyAlign > 8)
1357      return TyAlign;
1358    return 8;
1359  }
1360
1361  unsigned Align = 4;
1362  if (Subtarget->hasSSE1())
1363    getMaxByValAlign(Ty, Align);
1364  return Align;
1365}
1366
1367/// getOptimalMemOpType - Returns the target specific optimal type for load
1368/// and store operations as a result of memset, memcpy, and memmove
1369/// lowering. If DstAlign is zero, the destination alignment can satisfy any
1370/// constraint. Similarly, if SrcAlign is zero there is no need to check it
1371/// against an alignment requirement,
1372/// probably because the source does not need to be loaded. If 'IsMemset' is
1373/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1374/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1375/// source is constant so it does not need to be loaded.
1376/// It returns EVT::Other if the type should be determined using generic
1377/// target-independent logic.
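/// For example, a 32-byte memcpy with 16-byte-aligned operands on an AVX2
/// subtarget returns MVT::v8i32, so the copy is typically expanded into a
/// single 256-bit load/store pair, while a target without SSE falls back to
/// i64 or i32 chunks.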
1378EVT
1379X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1380                                       unsigned DstAlign, unsigned SrcAlign,
1381                                       bool IsMemset, bool ZeroMemset,
1382                                       bool MemcpyStrSrc,
1383                                       MachineFunction &MF) const {
1384  const Function *F = MF.getFunction();
1385  if ((!IsMemset || ZeroMemset) &&
1386      !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
1387    if (Size >= 16 &&
1388        (Subtarget->isUnalignedMemAccessFast() ||
1389         ((DstAlign == 0 || DstAlign >= 16) &&
1390          (SrcAlign == 0 || SrcAlign >= 16)))) {
1391      if (Size >= 32) {
1392        if (Subtarget->hasInt256())
1393          return MVT::v8i32;
1394        if (Subtarget->hasFp256())
1395          return MVT::v8f32;
1396      }
1397      if (Subtarget->hasSSE2())
1398        return MVT::v4i32;
1399      if (Subtarget->hasSSE1())
1400        return MVT::v4f32;
1401    } else if (!MemcpyStrSrc && Size >= 8 &&
1402               !Subtarget->is64Bit() &&
1403               Subtarget->hasSSE2()) {
1404      // Do not use f64 to lower memcpy if the source is a string constant. It's
1405      // better to use i32 to avoid the loads.
1406      return MVT::f64;
1407    }
1408  }
1409  if (Subtarget->is64Bit() && Size >= 8)
1410    return MVT::i64;
1411  return MVT::i32;
1412}
1413
1414bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1415  if (VT == MVT::f32)
1416    return X86ScalarSSEf32;
1417  else if (VT == MVT::f64)
1418    return X86ScalarSSEf64;
1419  return true;
1420}
1421
1422bool
1423X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
1424  if (Fast)
1425    *Fast = Subtarget->isUnalignedMemAccessFast();
1426  return true;
1427}
1428
1429/// getJumpTableEncoding - Return the entry encoding for a jump table in the
1430/// current function.  The returned value is a member of the
1431/// MachineJumpTableInfo::JTEntryKind enum.
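/// For example, in GOT-style 32-bit ELF PIC code each jump table entry is
/// emitted as something like
///   .long .LBB0_2@GOTOFF
/// and the dispatch sequence adds the PIC base register; other configurations
/// use the generic TargetLowering encoding heuristics.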
1432unsigned X86TargetLowering::getJumpTableEncoding() const {
1433  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1434  // symbol.
1435  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1436      Subtarget->isPICStyleGOT())
1437    return MachineJumpTableInfo::EK_Custom32;
1438
1439  // Otherwise, use the normal jump table encoding heuristics.
1440  return TargetLowering::getJumpTableEncoding();
1441}
1442
1443const MCExpr *
1444X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1445                                             const MachineBasicBlock *MBB,
1446                                             unsigned uid,MCContext &Ctx) const{
1447  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1448         Subtarget->isPICStyleGOT());
1449  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1450  // entries.
1451  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1452                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1453}
1454
1455/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
1456/// jumptable.
1457SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1458                                                    SelectionDAG &DAG) const {
1459  if (!Subtarget->is64Bit())
1460    // This doesn't have DebugLoc associated with it, but is not really the
1461    // same as a Register.
1462    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
1463  return Table;
1464}
1465
1466/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1467/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1468/// MCExpr.
1469const MCExpr *X86TargetLowering::
1470getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1471                             MCContext &Ctx) const {
1472  // X86-64 uses RIP relative addressing based on the jump table label.
1473  if (Subtarget->isPICStyleRIPRel())
1474    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1475
1476  // Otherwise, the reference is relative to the PIC base.
1477  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1478}
1479
1480// FIXME: Why is this routine here? Move to RegInfo!
1481std::pair<const TargetRegisterClass*, uint8_t>
1482X86TargetLowering::findRepresentativeClass(EVT VT) const {
1483  const TargetRegisterClass *RRC = 0;
1484  uint8_t Cost = 1;
1485  switch (VT.getSimpleVT().SimpleTy) {
1486  default:
1487    return TargetLowering::findRepresentativeClass(VT);
1488  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1489    RRC = Subtarget->is64Bit() ?
1490      (const TargetRegisterClass*)&X86::GR64RegClass :
1491      (const TargetRegisterClass*)&X86::GR32RegClass;
1492    break;
1493  case MVT::x86mmx:
1494    RRC = &X86::VR64RegClass;
1495    break;
1496  case MVT::f32: case MVT::f64:
1497  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1498  case MVT::v4f32: case MVT::v2f64:
1499  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1500  case MVT::v4f64:
1501    RRC = &X86::VR128RegClass;
1502    break;
1503  }
1504  return std::make_pair(RRC, Cost);
1505}
1506
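// For example, on 64-bit Linux the stack protector cookie lives in the TLS
// block at %fs:0x28, which is described below as offset 0x28 in address
// space 257; 32-bit Linux uses %gs:0x14, i.e. offset 0x14 in address
// space 256.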
1507bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1508                                               unsigned &Offset) const {
1509  if (!Subtarget->isTargetLinux())
1510    return false;
1511
1512  if (Subtarget->is64Bit()) {
1513    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1514    Offset = 0x28;
1515    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1516      AddressSpace = 256;
1517    else
1518      AddressSpace = 257;
1519  } else {
1520    // %gs:0x14 on i386
1521    Offset = 0x14;
1522    AddressSpace = 256;
1523  }
1524  return true;
1525}
1526
1527
1528//===----------------------------------------------------------------------===//
1529//               Return Value Calling Convention Implementation
1530//===----------------------------------------------------------------------===//
1531
1532#include "X86GenCallingConv.inc"
1533
1534bool
1535X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1536                                  MachineFunction &MF, bool isVarArg,
1537                        const SmallVectorImpl<ISD::OutputArg> &Outs,
1538                        LLVMContext &Context) const {
1539  SmallVector<CCValAssign, 16> RVLocs;
1540  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1541                 RVLocs, Context);
1542  return CCInfo.CheckReturn(Outs, RetCC_X86);
1543}
1544
1545SDValue
1546X86TargetLowering::LowerReturn(SDValue Chain,
1547                               CallingConv::ID CallConv, bool isVarArg,
1548                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1549                               const SmallVectorImpl<SDValue> &OutVals,
1550                               DebugLoc dl, SelectionDAG &DAG) const {
1551  MachineFunction &MF = DAG.getMachineFunction();
1552  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1553
1554  SmallVector<CCValAssign, 16> RVLocs;
1555  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1556                 RVLocs, *DAG.getContext());
1557  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1558
1559  // Add the regs to the liveout set for the function.
1560  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1561  for (unsigned i = 0; i != RVLocs.size(); ++i)
1562    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
1563      MRI.addLiveOut(RVLocs[i].getLocReg());
1564
1565  SDValue Flag;
1566
1567  SmallVector<SDValue, 6> RetOps;
1568  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1569  // Operand #1 = Bytes To Pop
1570  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1571                   MVT::i16));
1572
1573  // Copy the result values into the output registers.
1574  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1575    CCValAssign &VA = RVLocs[i];
1576    assert(VA.isRegLoc() && "Can only return in registers!");
1577    SDValue ValToCopy = OutVals[i];
1578    EVT ValVT = ValToCopy.getValueType();
1579
1580    // Promote values to the appropriate types
1581    if (VA.getLocInfo() == CCValAssign::SExt)
1582      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
1583    else if (VA.getLocInfo() == CCValAssign::ZExt)
1584      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
1585    else if (VA.getLocInfo() == CCValAssign::AExt)
1586      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
1587    else if (VA.getLocInfo() == CCValAssign::BCvt)
1588      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
1589
1590    // If this is x86-64, and we disabled SSE, we can't return FP values,
1591    // or SSE or MMX vectors.
1592    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
1593         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
1594          (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
1595      report_fatal_error("SSE register return with SSE disabled");
1596    }
1597    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
1598    // llvm-gcc has never done it right and no one has noticed, so this
1599    // should be OK for now.
1600    if (ValVT == MVT::f64 &&
1601        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
1602      report_fatal_error("SSE2 register return with SSE2 disabled");
1603
1604    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1605    // the RET instruction and handled by the FP Stackifier.
1606    if (VA.getLocReg() == X86::ST0 ||
1607        VA.getLocReg() == X86::ST1) {
1608      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1609      // change the value to the FP stack register class.
1610      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1611        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1612      RetOps.push_back(ValToCopy);
1613      // Don't emit a copytoreg.
1614      continue;
1615    }
1616
1617    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1618    // which is returned in RAX / RDX.
1619    if (Subtarget->is64Bit()) {
1620      if (ValVT == MVT::x86mmx) {
1621        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1622          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
1623          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
1624                                  ValToCopy);
1625          // If we don't have SSE2 available, convert to v4f32 so the generated
1626          // register is legal.
1627          if (!Subtarget->hasSSE2())
1628            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
1629        }
1630      }
1631    }
1632
1633    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1634    Flag = Chain.getValue(1);
1635  }
1636
1637  // The x86-64 ABI for returning structs by value requires that we copy
1638  // the sret argument into %rax for the return. We saved the argument into
1639  // a virtual register in the entry block, so now we copy the value out
1640  // and into %rax.
1641  if (Subtarget->is64Bit() &&
1642      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1643    MachineFunction &MF = DAG.getMachineFunction();
1644    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1645    unsigned Reg = FuncInfo->getSRetReturnReg();
1646    assert(Reg &&
1647           "SRetReturnReg should have been set in LowerFormalArguments().");
1648    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1649
1650    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1651    Flag = Chain.getValue(1);
1652
1653    // RAX now acts like a return value.
1654    MRI.addLiveOut(X86::RAX);
1655  }
1656
1657  RetOps[0] = Chain;  // Update chain.
1658
1659  // Add the flag if we have it.
1660  if (Flag.getNode())
1661    RetOps.push_back(Flag);
1662
1663  return DAG.getNode(X86ISD::RET_FLAG, dl,
1664                     MVT::Other, &RetOps[0], RetOps.size());
1665}
1666
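// In practice this matches a value whose only use is a CopyToReg (or
// FP_EXTEND) feeding directly into an X86ISD::RET_FLAG, i.e. the pattern
// produced by 'return f(x);', so the producing node can be folded into a
// tail call.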
1667bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
1668  if (N->getNumValues() != 1)
1669    return false;
1670  if (!N->hasNUsesOfValue(1, 0))
1671    return false;
1672
1673  SDValue TCChain = Chain;
1674  SDNode *Copy = *N->use_begin();
1675  if (Copy->getOpcode() == ISD::CopyToReg) {
1676    // If the copy has a glue operand, we conservatively assume it isn't safe to
1677    // perform a tail call.
1678    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
1679      return false;
1680    TCChain = Copy->getOperand(0);
1681  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
1682    return false;
1683
1684  bool HasRet = false;
1685  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
1686       UI != UE; ++UI) {
1687    if (UI->getOpcode() != X86ISD::RET_FLAG)
1688      return false;
1689    HasRet = true;
1690  }
1691
1692  if (!HasRet)
1693    return false;
1694
1695  Chain = TCChain;
1696  return true;
1697}
1698
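// For example, an i1 value returned zero-extended on x86-64 is only widened
// to i8, while other small integer returns are widened to at least i32.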
1699EVT
1700X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
1701                                            ISD::NodeType ExtendKind) const {
1702  MVT ReturnMVT;
1703  // TODO: Is this also valid on 32-bit?
1704  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
1705    ReturnMVT = MVT::i8;
1706  else
1707    ReturnMVT = MVT::i32;
1708
1709  EVT MinVT = getRegisterType(Context, ReturnMVT);
1710  return VT.bitsLT(MinVT) ? MinVT : VT;
1711}
1712
1713/// LowerCallResult - Lower the result values of a call into the
1714/// appropriate copies out of the corresponding physical registers.
1715///
1716SDValue
1717X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1718                                   CallingConv::ID CallConv, bool isVarArg,
1719                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1720                                   DebugLoc dl, SelectionDAG &DAG,
1721                                   SmallVectorImpl<SDValue> &InVals) const {
1722
1723  // Assign locations to each value returned by this call.
1724  SmallVector<CCValAssign, 16> RVLocs;
1725  bool Is64Bit = Subtarget->is64Bit();
1726  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1727                 getTargetMachine(), RVLocs, *DAG.getContext());
1728  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1729
1730  // Copy all of the result registers out of their specified physreg.
1731  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1732    CCValAssign &VA = RVLocs[i];
1733    EVT CopyVT = VA.getValVT();
1734
1735    // If this is x86-64, and we disabled SSE, we can't return FP values
1736    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1737        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1738      report_fatal_error("SSE register return with SSE disabled");
1739    }
1740
1741    SDValue Val;
1742
1743    // If this is a call to a function that returns an fp value on the floating
1744    // point stack, we must guarantee the value is popped from the stack, so
1745    // a CopyFromReg is not good enough - the copy instruction may be eliminated
1746    // if the return value is not used. We use the FpPOP_RETVAL instruction
1747    // instead.
1748    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1749      // If we prefer to use the value in xmm registers, copy it out as f80 and
1750      // use a truncate to move it from fp stack reg to xmm reg.
1751      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1752      SDValue Ops[] = { Chain, InFlag };
1753      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
1754                                         MVT::Other, MVT::Glue, Ops, 2), 1);
1755      Val = Chain.getValue(0);
1756
1757      // Round the f80 to the right size, which also moves it to the appropriate
1758      // xmm register.
1759      if (CopyVT != VA.getValVT())
1760        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1761                          // This truncation won't change the value.
1762                          DAG.getIntPtrConstant(1));
1763    } else {
1764      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1765                                 CopyVT, InFlag).getValue(1);
1766      Val = Chain.getValue(0);
1767    }
1768    InFlag = Chain.getValue(2);
1769    InVals.push_back(Val);
1770  }
1771
1772  return Chain;
1773}
1774
1775
1776//===----------------------------------------------------------------------===//
1777//                C & StdCall & Fast Calling Convention implementation
1778//===----------------------------------------------------------------------===//
1779//  The StdCall calling convention is the standard for many Windows API
1780//  routines. It differs from the C calling convention only slightly: the
1781//  callee cleans up the stack rather than the caller, and symbol names are
1782//  decorated. It doesn't support any vector arguments.
1783//  For info on fast calling convention see Fast Calling Convention (tail call)
1784//  implementation LowerX86_32FastCCCallTo.
1785
1786/// callIsStructReturn - Determines whether a call uses struct return
1787/// semantics.
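/// For example, a call to a function declared as
///   declare void @f(%struct.S* sret %out)
/// is classified as StackStructReturn, one declared as
///   declare void @g(%struct.S* inreg sret %out)
/// as RegStructReturn, and a call with no sret argument as NotStructReturn.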
1788enum StructReturnType {
1789  NotStructReturn,
1790  RegStructReturn,
1791  StackStructReturn
1792};
1793static StructReturnType
1794callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1795  if (Outs.empty())
1796    return NotStructReturn;
1797
1798  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
1799  if (!Flags.isSRet())
1800    return NotStructReturn;
1801  if (Flags.isInReg())
1802    return RegStructReturn;
1803  return StackStructReturn;
1804}
1805
1806/// argsAreStructReturn - Determines whether a function uses struct
1807/// return semantics.
1808static StructReturnType
1809argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1810  if (Ins.empty())
1811    return NotStructReturn;
1812
1813  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
1814  if (!Flags.isSRet())
1815    return NotStructReturn;
1816  if (Flags.isInReg())
1817    return RegStructReturn;
1818  return StackStructReturn;
1819}
1820
1821/// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
1822/// specified by "Src" to the address "Dst", with size and alignment given by
1823/// the specific parameter attribute. The copy will be passed as a byval
1824/// function parameter.
1825static SDValue
1826CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1827                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1828                          DebugLoc dl) {
1829  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1830
1831  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1832                       /*isVolatile*/false, /*AlwaysInline=*/true,
1833                       MachinePointerInfo(), MachinePointerInfo());
1834}
1835
1836/// IsTailCallConvention - Return true if the calling convention is one that
1837/// supports tail call optimization.
1838static bool IsTailCallConvention(CallingConv::ID CC) {
1839  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
1840          CC == CallingConv::HiPE);
1841}
1842
1843bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
1844  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
1845    return false;
1846
1847  CallSite CS(CI);
1848  CallingConv::ID CalleeCC = CS.getCallingConv();
1849  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1850    return false;
1851
1852  return true;
1853}
1854
1855/// FuncIsMadeTailCallSafe - Return true if the function is being made into
1856/// a tailcall target by changing its ABI.
1857static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
1858                                   bool GuaranteedTailCallOpt) {
1859  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1860}
1861
1862SDValue
1863X86TargetLowering::LowerMemArgument(SDValue Chain,
1864                                    CallingConv::ID CallConv,
1865                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1866                                    DebugLoc dl, SelectionDAG &DAG,
1867                                    const CCValAssign &VA,
1868                                    MachineFrameInfo *MFI,
1869                                    unsigned i) const {
1870  // Create the nodes corresponding to a load from this parameter slot.
1871  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1872  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
1873                              getTargetMachine().Options.GuaranteedTailCallOpt);
1874  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1875  EVT ValVT;
1876
1877  // If value is passed by pointer we have address passed instead of the value
1878  // itself.
1879  if (VA.getLocInfo() == CCValAssign::Indirect)
1880    ValVT = VA.getLocVT();
1881  else
1882    ValVT = VA.getValVT();
1883
1884  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1885  // changed with more analysis.
1886  // In the case of tail call optimization, mark all arguments mutable, since
1887  // they could be overwritten when the arguments are lowered for the tail call.
1888  if (Flags.isByVal()) {
1889    unsigned Bytes = Flags.getByValSize();
1890    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1891    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
1892    return DAG.getFrameIndex(FI, getPointerTy());
1893  } else {
1894    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1895                                    VA.getLocMemOffset(), isImmutable);
1896    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1897    return DAG.getLoad(ValVT, dl, Chain, FIN,
1898                       MachinePointerInfo::getFixedStack(FI),
1899                       false, false, false, 0);
1900  }
1901}
1902
1903SDValue
1904X86TargetLowering::LowerFormalArguments(SDValue Chain,
1905                                        CallingConv::ID CallConv,
1906                                        bool isVarArg,
1907                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1908                                        DebugLoc dl,
1909                                        SelectionDAG &DAG,
1910                                        SmallVectorImpl<SDValue> &InVals)
1911                                          const {
1912  MachineFunction &MF = DAG.getMachineFunction();
1913  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1914
1915  const Function* Fn = MF.getFunction();
1916  if (Fn->hasExternalLinkage() &&
1917      Subtarget->isTargetCygMing() &&
1918      Fn->getName() == "main")
1919    FuncInfo->setForceFramePointer(true);
1920
1921  MachineFrameInfo *MFI = MF.getFrameInfo();
1922  bool Is64Bit = Subtarget->is64Bit();
1923  bool IsWindows = Subtarget->isTargetWindows();
1924  bool IsWin64 = Subtarget->isTargetWin64();
1925
1926  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1927         "Var args not supported with calling convention fastcc, ghc or hipe");
1928
1929  // Assign locations to all of the incoming arguments.
1930  SmallVector<CCValAssign, 16> ArgLocs;
1931  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1932                 ArgLocs, *DAG.getContext());
1933
1934  // Allocate shadow area for Win64
1935  if (IsWin64) {
1936    CCInfo.AllocateStack(32, 8);
1937  }
1938
1939  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
1940
1941  unsigned LastVal = ~0U;
1942  SDValue ArgValue;
1943  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1944    CCValAssign &VA = ArgLocs[i];
1945    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1946    // places.
1947    assert(VA.getValNo() != LastVal &&
1948           "Don't support value assigned to multiple locs yet");
1949    (void)LastVal;
1950    LastVal = VA.getValNo();
1951
1952    if (VA.isRegLoc()) {
1953      EVT RegVT = VA.getLocVT();
1954      const TargetRegisterClass *RC;
1955      if (RegVT == MVT::i32)
1956        RC = &X86::GR32RegClass;
1957      else if (Is64Bit && RegVT == MVT::i64)
1958        RC = &X86::GR64RegClass;
1959      else if (RegVT == MVT::f32)
1960        RC = &X86::FR32RegClass;
1961      else if (RegVT == MVT::f64)
1962        RC = &X86::FR64RegClass;
1963      else if (RegVT.is256BitVector())
1964        RC = &X86::VR256RegClass;
1965      else if (RegVT.is128BitVector())
1966        RC = &X86::VR128RegClass;
1967      else if (RegVT == MVT::x86mmx)
1968        RC = &X86::VR64RegClass;
1969      else
1970        llvm_unreachable("Unknown argument type!");
1971
1972      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1973      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1974
1975      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1976      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1977      // right size.
1978      if (VA.getLocInfo() == CCValAssign::SExt)
1979        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1980                               DAG.getValueType(VA.getValVT()));
1981      else if (VA.getLocInfo() == CCValAssign::ZExt)
1982        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1983                               DAG.getValueType(VA.getValVT()));
1984      else if (VA.getLocInfo() == CCValAssign::BCvt)
1985        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
1986
1987      if (VA.isExtInLoc()) {
1988        // Handle MMX values passed in XMM regs.
1989        if (RegVT.isVector()) {
1990          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(),
1991                                 ArgValue);
1992        } else
1993          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1994      }
1995    } else {
1996      assert(VA.isMemLoc());
1997      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1998    }
1999
2000    // If value is passed via pointer - do a load.
2001    if (VA.getLocInfo() == CCValAssign::Indirect)
2002      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2003                             MachinePointerInfo(), false, false, false, 0);
2004
2005    InVals.push_back(ArgValue);
2006  }
2007
2008  // The x86-64 ABI for returning structs by value requires that we copy
2009  // the sret argument into %rax for the return. Save the argument into
2010  // a virtual register so that we can access it from the return points.
2011  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
2012    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2013    unsigned Reg = FuncInfo->getSRetReturnReg();
2014    if (!Reg) {
2015      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
2016      FuncInfo->setSRetReturnReg(Reg);
2017    }
2018    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
2019    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2020  }
2021
2022  unsigned StackSize = CCInfo.getNextStackOffset();
2023  // Align stack specially for tail calls.
2024  if (FuncIsMadeTailCallSafe(CallConv,
2025                             MF.getTarget().Options.GuaranteedTailCallOpt))
2026    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2027
2028  // If the function takes a variable number of arguments, make a frame index for
2029  // the start of the first vararg value... for expansion of llvm.va_start.
2030  if (isVarArg) {
2031    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2032                    CallConv != CallingConv::X86_ThisCall)) {
2033      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
2034    }
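    // On non-Windows x86-64 the register save area built below is
    // 6 * 8 = 48 bytes of GPRs followed by 8 * 16 = 128 bytes of XMM
    // registers (176 bytes total); va_arg then indexes into it using the
    // gp_offset and fp_offset values recorded here.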
2035    if (Is64Bit) {
2036      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
2037
2038      // FIXME: We should really autogenerate these arrays
2039      static const uint16_t GPR64ArgRegsWin64[] = {
2040        X86::RCX, X86::RDX, X86::R8,  X86::R9
2041      };
2042      static const uint16_t GPR64ArgRegs64Bit[] = {
2043        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2044      };
2045      static const uint16_t XMMArgRegs64Bit[] = {
2046        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2047        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2048      };
2049      const uint16_t *GPR64ArgRegs;
2050      unsigned NumXMMRegs = 0;
2051
2052      if (IsWin64) {
2053        // The XMM registers which might contain var arg parameters are shadowed
2054        // in their paired GPR.  So we only need to save the GPRs to their home
2055        // slots.
2056        TotalNumIntRegs = 4;
2057        GPR64ArgRegs = GPR64ArgRegsWin64;
2058      } else {
2059        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
2060        GPR64ArgRegs = GPR64ArgRegs64Bit;
2061
2062        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
2063                                                TotalNumXMMRegs);
2064      }
2065      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
2066                                                       TotalNumIntRegs);
2067
2068      bool NoImplicitFloatOps = Fn->getFnAttributes().
2069        hasAttribute(Attributes::NoImplicitFloat);
2070      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2071             "SSE register cannot be used when SSE is disabled!");
2072      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
2073               NoImplicitFloatOps) &&
2074             "SSE register cannot be used when SSE is disabled!");
2075      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2076          !Subtarget->hasSSE1())
2077        // Kernel mode asks for SSE to be disabled, so don't push the XMM
2078        // argument registers on the stack.
2079        TotalNumXMMRegs = 0;
2080
2081      if (IsWin64) {
2082        const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
2083        // Get to the caller-allocated home save location.  Add 8 to account
2084        // for the return address.
2085        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2086        FuncInfo->setRegSaveFrameIndex(
2087          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2088        // Fixup to set vararg frame on shadow area (4 x i64).
2089        if (NumIntRegs < 4)
2090          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2091      } else {
2092        // For X86-64, if there are vararg parameters that are passed via
2093        // registers, then we must store them to their spots on the stack so
2094        // they may be loaded by dereferencing the result of va_arg.
2095        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2096        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
2097        FuncInfo->setRegSaveFrameIndex(
2098          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
2099                               false));
2100      }
2101
2102      // Store the integer parameter registers.
2103      SmallVector<SDValue, 8> MemOps;
2104      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2105                                        getPointerTy());
2106      unsigned Offset = FuncInfo->getVarArgsGPOffset();
2107      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
2108        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2109                                  DAG.getIntPtrConstant(Offset));
2110        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
2111                                     &X86::GR64RegClass);
2112        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
2113        SDValue Store =
2114          DAG.getStore(Val.getValue(1), dl, Val, FIN,
2115                       MachinePointerInfo::getFixedStack(
2116                         FuncInfo->getRegSaveFrameIndex(), Offset),
2117                       false, false, 0);
2118        MemOps.push_back(Store);
2119        Offset += 8;
2120      }
2121
2122      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
2123        // Now store the XMM (fp + vector) parameter registers.
2124        SmallVector<SDValue, 11> SaveXMMOps;
2125        SaveXMMOps.push_back(Chain);
2126
2127        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2128        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
2129        SaveXMMOps.push_back(ALVal);
2130
2131        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2132                               FuncInfo->getRegSaveFrameIndex()));
2133        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2134                               FuncInfo->getVarArgsFPOffset()));
2135
2136        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
2137          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
2138                                       &X86::VR128RegClass);
2139          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
2140          SaveXMMOps.push_back(Val);
2141        }
2142        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2143                                     MVT::Other,
2144                                     &SaveXMMOps[0], SaveXMMOps.size()));
2145      }
2146
2147      if (!MemOps.empty())
2148        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2149                            &MemOps[0], MemOps.size());
2150    }
2151  }
2152
2153  // Some CCs need callee pop.
2154  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2155                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
2156    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2157  } else {
2158    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2159    // If this is an sret function, the return should pop the hidden pointer.
2160    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2161        argsAreStructReturn(Ins) == StackStructReturn)
2162      FuncInfo->setBytesToPopOnReturn(4);
2163  }
2164
2165  if (!Is64Bit) {
2166    // RegSaveFrameIndex is X86-64 only.
2167    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2168    if (CallConv == CallingConv::X86_FastCall ||
2169        CallConv == CallingConv::X86_ThisCall)
2170      // fastcc functions can't have varargs.
2171      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2172  }
2173
2174  FuncInfo->setArgumentStackSize(StackSize);
2175
2176  return Chain;
2177}
2178
2179SDValue
2180X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2181                                    SDValue StackPtr, SDValue Arg,
2182                                    DebugLoc dl, SelectionDAG &DAG,
2183                                    const CCValAssign &VA,
2184                                    ISD::ArgFlagsTy Flags) const {
2185  unsigned LocMemOffset = VA.getLocMemOffset();
2186  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2187  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2188  if (Flags.isByVal())
2189    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2190
2191  return DAG.getStore(Chain, dl, Arg, PtrOff,
2192                      MachinePointerInfo::getStack(LocMemOffset),
2193                      false, false, 0);
2194}
2195
2196/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
2197/// optimization is performed and it is required.
2198SDValue
2199X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2200                                           SDValue &OutRetAddr, SDValue Chain,
2201                                           bool IsTailCall, bool Is64Bit,
2202                                           int FPDiff, DebugLoc dl) const {
2203  // Adjust the Return address stack slot.
2204  EVT VT = getPointerTy();
2205  OutRetAddr = getReturnAddressFrameIndex(DAG);
2206
2207  // Load the "old" Return address.
2208  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2209                           false, false, false, 0);
2210  return SDValue(OutRetAddr.getNode(), 1);
2211}
2212
2213/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
2214/// optimization is performed and it is required (FPDiff!=0).
2215static SDValue
2216EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
2217                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
2218                         unsigned SlotSize, int FPDiff, DebugLoc dl) {
2219  // Store the return address to the appropriate stack slot.
2220  if (!FPDiff) return Chain;
2221  // Calculate the new stack slot for the return address.
2222  int NewReturnAddrFI =
2223    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
2224  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2225  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2226                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2227                       false, false, 0);
2228  return Chain;
2229}
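// Roughly, FPDiff is the byte difference between the caller's incoming
// argument area and the outgoing argument area of the tail call; when it is
// non-zero the return address must be re-stored at the shifted slot computed
// above so it stays adjacent to the relocated arguments.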
2230
2231SDValue
2232X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2233                             SmallVectorImpl<SDValue> &InVals) const {
2234  SelectionDAG &DAG                     = CLI.DAG;
2235  DebugLoc &dl                          = CLI.DL;
2236  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2237  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
2238  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
2239  SDValue Chain                         = CLI.Chain;
2240  SDValue Callee                        = CLI.Callee;
2241  CallingConv::ID CallConv              = CLI.CallConv;
2242  bool &isTailCall                      = CLI.IsTailCall;
2243  bool isVarArg                         = CLI.IsVarArg;
2244
2245  MachineFunction &MF = DAG.getMachineFunction();
2246  bool Is64Bit        = Subtarget->is64Bit();
2247  bool IsWin64        = Subtarget->isTargetWin64();
2248  bool IsWindows      = Subtarget->isTargetWindows();
2249  StructReturnType SR = callIsStructReturn(Outs);
2250  bool IsSibcall      = false;
2251
2252  if (MF.getTarget().Options.DisableTailCalls)
2253    isTailCall = false;
2254
2255  if (isTailCall) {
2256    // Check if it's really possible to do a tail call.
2257    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2258                    isVarArg, SR != NotStructReturn,
2259                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2260                    Outs, OutVals, Ins, DAG);
2261
2262    // Sibcalls are automatically detected tailcalls which do not require
2263    // ABI changes.
2264    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2265      IsSibcall = true;
2266
2267    if (isTailCall)
2268      ++NumTailCalls;
2269  }
2270
2271  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2272         "Var args not supported with calling convention fastcc, ghc or hipe");
2273
2274  // Analyze operands of the call, assigning locations to each operand.
2275  SmallVector<CCValAssign, 16> ArgLocs;
2276  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2277                 ArgLocs, *DAG.getContext());
2278
2279  // Allocate shadow area for Win64
2280  if (IsWin64) {
2281    CCInfo.AllocateStack(32, 8);
2282  }
2283
2284  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2285
2286  // Get a count of how many bytes are to be pushed on the stack.
2287  unsigned NumBytes = CCInfo.getNextStackOffset();
2288  if (IsSibcall)
2289    // This is a sibcall. The memory operands are already available in the
2290    // caller's incoming argument area (its own caller's stack).
2291    NumBytes = 0;
2292  else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
2293           IsTailCallConvention(CallConv))
2294    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2295
2296  int FPDiff = 0;
2297  if (isTailCall && !IsSibcall) {
2298    // Lower arguments at fp - stackoffset + fpdiff.
2299    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2300    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2301
2302    FPDiff = NumBytesCallerPushed - NumBytes;
2303
2304    // Set the delta of movement of the returnaddr stackslot.
2305    // But only update it if the new delta is lower than the previous delta.
2306    if (FPDiff < X86Info->getTCReturnAddrDelta())
2307      X86Info->setTCReturnAddrDelta(FPDiff);
2308  }
2309
2310  if (!IsSibcall)
2311    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
2312
2313  SDValue RetAddrFrIdx;
2314  // Load return address for tail calls.
2315  if (isTailCall && FPDiff)
2316    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2317                                    Is64Bit, FPDiff, dl);
2318
2319  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2320  SmallVector<SDValue, 8> MemOpChains;
2321  SDValue StackPtr;
2322
2323  // Walk the register/memloc assignments, inserting copies/loads.  In the case
2324  // of tail call optimization, arguments are handled later.
2325  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2326    CCValAssign &VA = ArgLocs[i];
2327    EVT RegVT = VA.getLocVT();
2328    SDValue Arg = OutVals[i];
2329    ISD::ArgFlagsTy Flags = Outs[i].Flags;
2330    bool isByVal = Flags.isByVal();
2331
2332    // Promote the value if needed.
2333    switch (VA.getLocInfo()) {
2334    default: llvm_unreachable("Unknown loc info!");
2335    case CCValAssign::Full: break;
2336    case CCValAssign::SExt:
2337      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2338      break;
2339    case CCValAssign::ZExt:
2340      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2341      break;
2342    case CCValAssign::AExt:
2343      if (RegVT.is128BitVector()) {
2344        // Special case: passing MMX values in XMM registers.
2345        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2346        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2347        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2348      } else
2349        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2350      break;
2351    case CCValAssign::BCvt:
2352      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2353      break;
2354    case CCValAssign::Indirect: {
2355      // Store the argument.
2356      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2357      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2358      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2359                           MachinePointerInfo::getFixedStack(FI),
2360                           false, false, 0);
2361      Arg = SpillSlot;
2362      break;
2363    }
2364    }
2365
2366    if (VA.isRegLoc()) {
2367      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2368      if (isVarArg && IsWin64) {
2369        // Win64 ABI requires argument XMM reg to be copied to the corresponding
2370        // shadow reg if callee is a varargs function.
2371        unsigned ShadowReg = 0;
2372        switch (VA.getLocReg()) {
2373        case X86::XMM0: ShadowReg = X86::RCX; break;
2374        case X86::XMM1: ShadowReg = X86::RDX; break;
2375        case X86::XMM2: ShadowReg = X86::R8; break;
2376        case X86::XMM3: ShadowReg = X86::R9; break;
2377        }
2378        if (ShadowReg)
2379          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2380      }
2381    } else if (!IsSibcall && (!isTailCall || isByVal)) {
2382      assert(VA.isMemLoc());
2383      if (StackPtr.getNode() == 0)
2384        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2385                                      getPointerTy());
2386      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2387                                             dl, DAG, VA, Flags));
2388    }
2389  }
2390
2391  if (!MemOpChains.empty())
2392    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2393                        &MemOpChains[0], MemOpChains.size());
2394
2395  if (Subtarget->isPICStyleGOT()) {
2396    // ELF / PIC requires the GOT address to be in the EBX register before
2397    // making function calls via the PLT.
2398    if (!isTailCall) {
2399      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2400               DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
2401    } else {
2402      // If we are tail calling and generating PIC/GOT style code, load the
2403      // address of the callee into ECX. The value in ecx is used as target of
2404      // the tail jump. This is done to circumvent the ebx/callee-saved problem
2405      // for tail calls on PIC/GOT architectures. Normally we would just put the
2406      // address of GOT into ebx and then call target@PLT. But for tail calls
2407      // ebx would be restored (since ebx is callee saved) before jumping to the
2408      // target@PLT.
2409
2410      // Note: The actual moving to ECX is done further down.
2411      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2412      if (G && !G->getGlobal()->hasHiddenVisibility() &&
2413          !G->getGlobal()->hasProtectedVisibility())
2414        Callee = LowerGlobalAddress(Callee, DAG);
2415      else if (isa<ExternalSymbolSDNode>(Callee))
2416        Callee = LowerExternalSymbol(Callee, DAG);
2417    }
2418  }
2419
2420  if (Is64Bit && isVarArg && !IsWin64) {
2421    // From AMD64 ABI document:
2422    // For calls that may call functions that use varargs or stdargs
2423    // (prototype-less calls or calls to functions containing ellipsis (...) in
2424    // the declaration), %al is used as a hidden argument to specify the number
2425    // of SSE registers used. The contents of %al do not need to match exactly
2426    // the number of registers, but must be an upper bound on the number of SSE
2427    // registers used and must be in the range 0 - 8 inclusive.
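    // For example, a call like printf("%f %f\n", x, y) that passes two
    // doubles in XMM0 and XMM1 would typically set AL to 2 (e.g. "movb $2,
    // %al") immediately before the call instruction.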
2428
2429    // Count the number of XMM registers allocated.
2430    static const uint16_t XMMArgRegs[] = {
2431      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2432      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2433    };
2434    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2435    assert((Subtarget->hasSSE1() || !NumXMMRegs)
2436           && "SSE registers cannot be used when SSE is disabled");
2437
2438    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
2439                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
2440  }
2441
2442  // For tail calls lower the arguments to the 'real' stack slot.
2443  if (isTailCall) {
2444    // Force all the incoming stack arguments to be loaded from the stack
2445    // before any new outgoing arguments are stored to the stack, because the
2446    // outgoing stack slots may alias the incoming argument stack slots, and
2447    // the alias isn't otherwise explicit. This is slightly more conservative
2448    // than necessary, because it means that each store effectively depends
2449    // on every argument instead of just those arguments it would clobber.
2450    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2451
2452    SmallVector<SDValue, 8> MemOpChains2;
2453    SDValue FIN;
2454    int FI = 0;
2455    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2456      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2457        CCValAssign &VA = ArgLocs[i];
2458        if (VA.isRegLoc())
2459          continue;
2460        assert(VA.isMemLoc());
2461        SDValue Arg = OutVals[i];
2462        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2463        // Create frame index.
2464        int32_t Offset = VA.getLocMemOffset()+FPDiff;
2465        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2466        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2467        FIN = DAG.getFrameIndex(FI, getPointerTy());
2468
2469        if (Flags.isByVal()) {
2470          // Copy relative to framepointer.
2471          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2472          if (StackPtr.getNode() == 0)
2473            StackPtr = DAG.getCopyFromReg(Chain, dl,
2474                                          RegInfo->getStackRegister(),
2475                                          getPointerTy());
2476          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2477
2478          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2479                                                           ArgChain,
2480                                                           Flags, DAG, dl));
2481        } else {
2482          // Store relative to framepointer.
2483          MemOpChains2.push_back(
2484            DAG.getStore(ArgChain, dl, Arg, FIN,
2485                         MachinePointerInfo::getFixedStack(FI),
2486                         false, false, 0));
2487        }
2488      }
2489    }
2490
2491    if (!MemOpChains2.empty())
2492      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2493                          &MemOpChains2[0], MemOpChains2.size());
2494
2495    // Store the return address to the appropriate stack slot.
2496    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2497                                     getPointerTy(), RegInfo->getSlotSize(),
2498                                     FPDiff, dl);
2499  }
2500
2501  // Build a sequence of copy-to-reg nodes chained together with token chain
2502  // and flag operands which copy the outgoing args into registers.
2503  SDValue InFlag;
2504  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2505    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2506                             RegsToPass[i].second, InFlag);
2507    InFlag = Chain.getValue(1);
2508  }
2509
2510  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2511    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2512    // In the 64-bit large code model, we have to make all calls
2513    // through a register, since the call instruction's 32-bit
2514    // pc-relative offset may not be large enough to hold the whole
2515    // address.
2516  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2517    // If the callee is a GlobalAddress node (quite common, every direct call
2518    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2519    // it.
2520
2521      // We should use an extra load for direct calls to dllimported functions
2522      // in non-JIT mode.
2523    const GlobalValue *GV = G->getGlobal();
2524    if (!GV->hasDLLImportLinkage()) {
2525      unsigned char OpFlags = 0;
2526      bool ExtraLoad = false;
2527      unsigned WrapperKind = ISD::DELETED_NODE;
2528
2529      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2530      // external symbols must go through the PLT in PIC mode.  If the symbol
2531      // has hidden or protected visibility, or if it is static or local, then
2532      // we don't need to use the PLT - we can directly call it.
2533      if (Subtarget->isTargetELF() &&
2534          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2535          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2536        OpFlags = X86II::MO_PLT;
2537      } else if (Subtarget->isPICStyleStubAny() &&
2538                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
2539                 (!Subtarget->getTargetTriple().isMacOSX() ||
2540                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2541        // PC-relative references to external symbols should go through $stub,
2542        // unless we're building with the Leopard linker or later, which
2543        // automatically synthesizes these stubs.
2544        OpFlags = X86II::MO_DARWIN_STUB;
2545      } else if (Subtarget->isPICStyleRIPRel() &&
2546                 isa<Function>(GV) &&
2547                 cast<Function>(GV)->getFnAttributes().
2548                   hasAttribute(Attributes::NonLazyBind)) {
2549        // If the function is marked as non-lazy, generate an indirect call
2550        // which loads from the GOT directly. This avoids runtime overhead
2551        // at the cost of eager binding (and one extra byte of encoding).
2552        OpFlags = X86II::MO_GOTPCREL;
2553        WrapperKind = X86ISD::WrapperRIP;
2554        ExtraLoad = true;
2555      }
2556
2557      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2558                                          G->getOffset(), OpFlags);
2559
2560      // Add a wrapper if needed.
2561      if (WrapperKind != ISD::DELETED_NODE)
2562        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
2563      // Add extra indirection if needed.
2564      if (ExtraLoad)
2565        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
2566                             MachinePointerInfo::getGOT(),
2567                             false, false, false, 0);
2568    }
2569  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2570    unsigned char OpFlags = 0;
2571
2572    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2573    // external symbols should go through the PLT.
2574    if (Subtarget->isTargetELF() &&
2575        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2576      OpFlags = X86II::MO_PLT;
2577    } else if (Subtarget->isPICStyleStubAny() &&
2578               (!Subtarget->getTargetTriple().isMacOSX() ||
2579                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2580      // PC-relative references to external symbols should go through $stub,
2581      // unless we're building with the Leopard linker or later, which
2582      // automatically synthesizes these stubs.
2583      OpFlags = X86II::MO_DARWIN_STUB;
2584    }
2585
2586    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2587                                         OpFlags);
2588  }
2589
2590  // Returns a chain & a flag for retval copy to use.
2591  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2592  SmallVector<SDValue, 8> Ops;
2593
2594  if (!IsSibcall && isTailCall) {
2595    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2596                           DAG.getIntPtrConstant(0, true), InFlag);
2597    InFlag = Chain.getValue(1);
2598  }
2599
2600  Ops.push_back(Chain);
2601  Ops.push_back(Callee);
2602
2603  if (isTailCall)
2604    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2605
2606  // Add argument registers to the end of the list so that they are known live
2607  // into the call.
2608  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2609    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2610                                  RegsToPass[i].second.getValueType()));
2611
2612  // Add a register mask operand representing the call-preserved registers.
2613  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
2614  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
2615  assert(Mask && "Missing call preserved mask for calling convention");
2616  Ops.push_back(DAG.getRegisterMask(Mask));
2617
2618  if (InFlag.getNode())
2619    Ops.push_back(InFlag);
2620
2621  if (isTailCall) {
2622    // We used to do:
2623    //// If this is the first return lowered for this function, add the regs
2624    //// to the liveout set for the function.
2625    // This isn't right, although it's probably harmless on x86; liveouts
2626    // should be computed from returns not tail calls.  Consider a void
2627    // function making a tail call to a function returning int.
2628    return DAG.getNode(X86ISD::TC_RETURN, dl,
2629                       NodeTys, &Ops[0], Ops.size());
2630  }
2631
2632  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2633  InFlag = Chain.getValue(1);
2634
2635  // Create the CALLSEQ_END node.
2636  unsigned NumBytesForCalleeToPush;
2637  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2638                       getTargetMachine().Options.GuaranteedTailCallOpt))
2639    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2640  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2641           SR == StackStructReturn)
2642    // If this is a call to a struct-return function, the callee
2643    // pops the hidden struct pointer, so we have to push it back.
2644    // This is common for Darwin/X86, Linux & Mingw32 targets.
2645    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
2646    NumBytesForCalleeToPush = 4;
2647  else
2648    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2649
2650  // Returns a flag for retval copy to use.
2651  if (!IsSibcall) {
2652    Chain = DAG.getCALLSEQ_END(Chain,
2653                               DAG.getIntPtrConstant(NumBytes, true),
2654                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2655                                                     true),
2656                               InFlag);
2657    InFlag = Chain.getValue(1);
2658  }
2659
2660  // Handle result values, copying them out of physregs into vregs that we
2661  // return.
2662  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2663                         Ins, dl, DAG, InVals);
2664}
2665
2666
2667//===----------------------------------------------------------------------===//
2668//                Fast Calling Convention (tail call) implementation
2669//===----------------------------------------------------------------------===//
2670
2671//  Like stdcall, the callee cleans up the arguments, except that ECX is
2672//  reserved for storing the address of the tail-called function. Only 2
2673//  registers are free for argument passing (inreg). Tail call optimization is
2674//  performed provided:
2675//                * tailcallopt is enabled
2676//                * caller/callee are fastcc
2677//  On the X86_64 architecture with GOT-style position-independent code, only
2678//  local (within-module) calls are supported at the moment.
2679//  To keep the stack aligned according to the platform ABI, the function
2680//  GetAlignedArgumentStackSize ensures that the argument delta is always a
2681//  multiple of the stack alignment. (Dynamic linkers need this - Darwin's
2682//  dyld, for example.) If a tail-called callee has more arguments than the
2683//  caller, the caller needs to make sure that there is room to move the
2684//  RETADDR to. This is achieved by reserving an area the size of the argument
2685//  delta right after the original RETADDR, but before the saved frame pointer
2686//  or the spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
2687//  stack layout:
2688//    arg1
2689//    arg2
2690//    RETADDR
2691//    [ new RETADDR
2692//      move area ]
2693//    (possible EBP)
2694//    ESI
2695//    EDI
2696//    local1 ..
2697
2698/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for a
2699/// 16-byte alignment requirement.
2700unsigned
2701X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2702                                               SelectionDAG& DAG) const {
2703  MachineFunction &MF = DAG.getMachineFunction();
2704  const TargetMachine &TM = MF.getTarget();
2705  const TargetFrameLowering &TFI = *TM.getFrameLowering();
2706  unsigned StackAlignment = TFI.getStackAlignment();
2707  uint64_t AlignMask = StackAlignment - 1;
2708  int64_t Offset = StackSize;
2709  unsigned SlotSize = RegInfo->getSlotSize();
2710  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2711    // The misalignment fits below (StackAlignment - SlotSize); just add the difference.
2712    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2713  } else {
2714    // Mask out the lower bits, then add one full stack alignment plus (StackAlignment - SlotSize).
2715    Offset = ((~AlignMask) & Offset) + StackAlignment +
2716      (StackAlignment-SlotSize);
2717  }
2718  return Offset;
2719}
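// As a worked example of the computation above (assuming the common 32-bit
// configuration of a 16-byte stack alignment and a 4-byte slot size):
//   StackSize = 20: 20 & 15 = 4 <= 12, so Offset = 20 + (12 - 4) = 28 = 16*1 + 12
//   StackSize = 30: 30 & 15 = 14 > 12, so Offset = (30 & ~15) + 16 + 12 = 44 = 16*2 + 12
// In both cases the result is SlotSize short of a multiple of the alignment, so
// the stack becomes aligned again once the return address slot is pushed.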
2720
2721/// MatchingStackOffset - Return true if the given stack call argument is
2722/// already available in the same position (relatively) of the caller's
2723/// incoming argument stack.
2724static
2725bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2726                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2727                         const X86InstrInfo *TII) {
2728  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2729  int FI = INT_MAX;
2730  if (Arg.getOpcode() == ISD::CopyFromReg) {
2731    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2732    if (!TargetRegisterInfo::isVirtualRegister(VR))
2733      return false;
2734    MachineInstr *Def = MRI->getVRegDef(VR);
2735    if (!Def)
2736      return false;
2737    if (!Flags.isByVal()) {
2738      if (!TII->isLoadFromStackSlot(Def, FI))
2739        return false;
2740    } else {
2741      unsigned Opcode = Def->getOpcode();
2742      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2743          Def->getOperand(1).isFI()) {
2744        FI = Def->getOperand(1).getIndex();
2745        Bytes = Flags.getByValSize();
2746      } else
2747        return false;
2748    }
2749  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2750    if (Flags.isByVal())
2751      // ByVal argument is passed in as a pointer but it's now being
2752      // dereferenced. e.g.
2753      // define @foo(%struct.X* %A) {
2754      //   tail call @bar(%struct.X* byval %A)
2755      // }
2756      return false;
2757    SDValue Ptr = Ld->getBasePtr();
2758    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2759    if (!FINode)
2760      return false;
2761    FI = FINode->getIndex();
2762  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
2763    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
2764    FI = FINode->getIndex();
2765    Bytes = Flags.getByValSize();
2766  } else
2767    return false;
2768
2769  assert(FI != INT_MAX);
2770  if (!MFI->isFixedObjectIndex(FI))
2771    return false;
2772  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2773}
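// For illustration (a hypothetical example, not taken from a test case): in a
// simple argument-forwarding call such as
//   define i32 @caller(i32 %x, i32 %y) {
//     %r = tail call i32 @callee(i32 %x, i32 %y)
//     ret i32 %r
//   }
// the 32-bit outgoing arguments are typically loads from the caller's fixed
// incoming stack objects at the same offsets they would be stored to, so
// MatchingStackOffset returns true for both and no outgoing stores are needed.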
2774
2775/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2776/// for tail call optimization. Targets which want to do tail call
2777/// optimization should implement this function.
2778bool
2779X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2780                                                     CallingConv::ID CalleeCC,
2781                                                     bool isVarArg,
2782                                                     bool isCalleeStructRet,
2783                                                     bool isCallerStructRet,
2784                                                     Type *RetTy,
2785                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
2786                                    const SmallVectorImpl<SDValue> &OutVals,
2787                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2788                                                     SelectionDAG& DAG) const {
2789  if (!IsTailCallConvention(CalleeCC) &&
2790      CalleeCC != CallingConv::C)
2791    return false;
2792
2793  // If -tailcallopt is specified, make fastcc functions tail-callable.
2794  const MachineFunction &MF = DAG.getMachineFunction();
2795  const Function *CallerF = DAG.getMachineFunction().getFunction();
2796
2797  // If the function return type is x86_fp80 and the callee return type is not,
2798  // then the FP_EXTEND of the call result is not a nop. It's not safe to
2799  // perform a tailcall optimization here.
2800  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
2801    return false;
2802
2803  CallingConv::ID CallerCC = CallerF->getCallingConv();
2804  bool CCMatch = CallerCC == CalleeCC;
2805
2806  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2807    if (IsTailCallConvention(CalleeCC) && CCMatch)
2808      return true;
2809    return false;
2810  }
2811
2812  // Look for obvious safe cases to perform tail call optimization that do not
2813  // require ABI changes. This is what gcc calls sibcall.
2814
2815  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2816  // emit a special epilogue.
2817  if (RegInfo->needsStackRealignment(MF))
2818    return false;
2819
2820  // Also avoid sibcall optimization if either caller or callee uses struct
2821  // return semantics.
2822  if (isCalleeStructRet || isCallerStructRet)
2823    return false;
2824
2825  // An stdcall caller is expected to clean up its arguments; the callee
2826  // isn't going to do that.
2827  if (!CCMatch && CallerCC==CallingConv::X86_StdCall)
2828    return false;
2829
2830  // Do not sibcall optimize vararg calls unless all arguments are passed via
2831  // registers.
2832  if (isVarArg && !Outs.empty()) {
2833
2834    // Optimizing for varargs on Win64 is unlikely to be safe without
2835    // additional testing.
2836    if (Subtarget->isTargetWin64())
2837      return false;
2838
2839    SmallVector<CCValAssign, 16> ArgLocs;
2840    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2841                   getTargetMachine(), ArgLocs, *DAG.getContext());
2842
2843    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2844    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
2845      if (!ArgLocs[i].isRegLoc())
2846        return false;
2847  }
2848
2849  // If the call result is in ST0 / ST1, it needs to be popped off the x87
2850  // stack.  Therefore, if the result is not used, it is not safe to optimize
2851  // this into a sibcall.
2852  bool Unused = false;
2853  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
2854    if (!Ins[i].Used) {
2855      Unused = true;
2856      break;
2857    }
2858  }
2859  if (Unused) {
2860    SmallVector<CCValAssign, 16> RVLocs;
2861    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
2862                   getTargetMachine(), RVLocs, *DAG.getContext());
2863    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2864    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2865      CCValAssign &VA = RVLocs[i];
2866      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
2867        return false;
2868    }
2869  }
2870
2871  // If the calling conventions do not match, then we'd better make sure the
2872  // results are returned in the same way as what the caller expects.
2873  if (!CCMatch) {
2874    SmallVector<CCValAssign, 16> RVLocs1;
2875    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
2876                    getTargetMachine(), RVLocs1, *DAG.getContext());
2877    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
2878
2879    SmallVector<CCValAssign, 16> RVLocs2;
2880    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
2881                    getTargetMachine(), RVLocs2, *DAG.getContext());
2882    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
2883
2884    if (RVLocs1.size() != RVLocs2.size())
2885      return false;
2886    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2887      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2888        return false;
2889      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2890        return false;
2891      if (RVLocs1[i].isRegLoc()) {
2892        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2893          return false;
2894      } else {
2895        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2896          return false;
2897      }
2898    }
2899  }
2900
2901  // If the callee takes no arguments then go on to check the results of the
2902  // call.
2903  if (!Outs.empty()) {
2904    // Check if stack adjustment is needed. For now, do not do this if any
2905    // argument is passed on the stack.
2906    SmallVector<CCValAssign, 16> ArgLocs;
2907    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2908                   getTargetMachine(), ArgLocs, *DAG.getContext());
2909
2910    // Allocate shadow area for Win64
2911    if (Subtarget->isTargetWin64()) {
2912      CCInfo.AllocateStack(32, 8);
2913    }
2914
2915    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2916    if (CCInfo.getNextStackOffset()) {
2917      MachineFunction &MF = DAG.getMachineFunction();
2918      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2919        return false;
2920
2921      // Check if the arguments are already laid out in the right way as
2922      // the caller's fixed stack objects.
2923      MachineFrameInfo *MFI = MF.getFrameInfo();
2924      const MachineRegisterInfo *MRI = &MF.getRegInfo();
2925      const X86InstrInfo *TII =
2926        ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
2927      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2928        CCValAssign &VA = ArgLocs[i];
2929        SDValue Arg = OutVals[i];
2930        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2931        if (VA.getLocInfo() == CCValAssign::Indirect)
2932          return false;
2933        if (!VA.isRegLoc()) {
2934          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2935                                   MFI, MRI, TII))
2936            return false;
2937        }
2938      }
2939    }
2940
2941    // If the tailcall address may be in a register, then make sure it's
2942    // possible to register allocate for it. In 32-bit, the call address can
2943    // only target EAX, EDX, or ECX since the tail call must be scheduled after
2944    // callee-saved registers are restored. These happen to be the same
2945    // registers used to pass 'inreg' arguments so watch out for those.
2946    if (!Subtarget->is64Bit() &&
2947        !isa<GlobalAddressSDNode>(Callee) &&
2948        !isa<ExternalSymbolSDNode>(Callee)) {
2949      unsigned NumInRegs = 0;
2950      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2951        CCValAssign &VA = ArgLocs[i];
2952        if (!VA.isRegLoc())
2953          continue;
2954        unsigned Reg = VA.getLocReg();
2955        switch (Reg) {
2956        default: break;
2957        case X86::EAX: case X86::EDX: case X86::ECX:
2958          if (++NumInRegs == 3)
2959            return false;
2960          break;
2961        }
2962      }
2963    }
2964  }
2965
2966  return true;
2967}
2968
2969FastISel *
2970X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2971                                  const TargetLibraryInfo *libInfo) const {
2972  return X86::createFastISel(funcInfo, libInfo);
2973}
2974
2975
2976//===----------------------------------------------------------------------===//
2977//                           Other Lowering Hooks
2978//===----------------------------------------------------------------------===//
2979
2980static bool MayFoldLoad(SDValue Op) {
2981  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
2982}
2983
2984static bool MayFoldIntoStore(SDValue Op) {
2985  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2986}
2987
2988static bool isTargetShuffle(unsigned Opcode) {
2989  switch(Opcode) {
2990  default: return false;
2991  case X86ISD::PSHUFD:
2992  case X86ISD::PSHUFHW:
2993  case X86ISD::PSHUFLW:
2994  case X86ISD::SHUFP:
2995  case X86ISD::PALIGN:
2996  case X86ISD::MOVLHPS:
2997  case X86ISD::MOVLHPD:
2998  case X86ISD::MOVHLPS:
2999  case X86ISD::MOVLPS:
3000  case X86ISD::MOVLPD:
3001  case X86ISD::MOVSHDUP:
3002  case X86ISD::MOVSLDUP:
3003  case X86ISD::MOVDDUP:
3004  case X86ISD::MOVSS:
3005  case X86ISD::MOVSD:
3006  case X86ISD::UNPCKL:
3007  case X86ISD::UNPCKH:
3008  case X86ISD::VPERMILP:
3009  case X86ISD::VPERM2X128:
3010  case X86ISD::VPERMI:
3011    return true;
3012  }
3013}
3014
3015static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
3016                                    SDValue V1, SelectionDAG &DAG) {
3017  switch(Opc) {
3018  default: llvm_unreachable("Unknown x86 shuffle node");
3019  case X86ISD::MOVSHDUP:
3020  case X86ISD::MOVSLDUP:
3021  case X86ISD::MOVDDUP:
3022    return DAG.getNode(Opc, dl, VT, V1);
3023  }
3024}
3025
3026static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
3027                                    SDValue V1, unsigned TargetMask,
3028                                    SelectionDAG &DAG) {
3029  switch(Opc) {
3030  default: llvm_unreachable("Unknown x86 shuffle node");
3031  case X86ISD::PSHUFD:
3032  case X86ISD::PSHUFHW:
3033  case X86ISD::PSHUFLW:
3034  case X86ISD::VPERMILP:
3035  case X86ISD::VPERMI:
3036    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3037  }
3038}
3039
3040static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
3041                                    SDValue V1, SDValue V2, unsigned TargetMask,
3042                                    SelectionDAG &DAG) {
3043  switch(Opc) {
3044  default: llvm_unreachable("Unknown x86 shuffle node");
3045  case X86ISD::PALIGN:
3046  case X86ISD::SHUFP:
3047  case X86ISD::VPERM2X128:
3048    return DAG.getNode(Opc, dl, VT, V1, V2,
3049                       DAG.getConstant(TargetMask, MVT::i8));
3050  }
3051}
3052
3053static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
3054                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
3055  switch(Opc) {
3056  default: llvm_unreachable("Unknown x86 shuffle node");
3057  case X86ISD::MOVLHPS:
3058  case X86ISD::MOVLHPD:
3059  case X86ISD::MOVHLPS:
3060  case X86ISD::MOVLPS:
3061  case X86ISD::MOVLPD:
3062  case X86ISD::MOVSS:
3063  case X86ISD::MOVSD:
3064  case X86ISD::UNPCKL:
3065  case X86ISD::UNPCKH:
3066    return DAG.getNode(Opc, dl, VT, V1, V2);
3067  }
3068}
3069
3070SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3071  MachineFunction &MF = DAG.getMachineFunction();
3072  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3073  int ReturnAddrIndex = FuncInfo->getRAIndex();
3074
3075  if (ReturnAddrIndex == 0) {
3076    // Set up a frame object for the return address.
3077    unsigned SlotSize = RegInfo->getSlotSize();
3078    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
3079                                                           false);
3080    FuncInfo->setRAIndex(ReturnAddrIndex);
3081  }
3082
3083  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3084}
3085
3086
3087bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3088                                       bool hasSymbolicDisplacement) {
3089  // Offset should fit into 32 bit immediate field.
3090  if (!isInt<32>(Offset))
3091    return false;
3092
3093  // If we don't have a symbolic displacement - we don't have any extra
3094  // restrictions.
3095  if (!hasSymbolicDisplacement)
3096    return true;
3097
3098  // FIXME: Some tweaks might be needed for medium code model.
3099  if (M != CodeModel::Small && M != CodeModel::Kernel)
3100    return false;
3101
3102  // For the small code model we assume that the last object ends at least 16MB
3103  // before the end of the 31-bit boundary. We may also accept pretty large
3104  // negative constants, knowing that all objects lie in the positive half of the address space.
3105  if (M == CodeModel::Small && Offset < 16*1024*1024)
3106    return true;
3107
3108  // For the kernel code model we know that all objects reside in the negative
3109  // half of the 32-bit address space. We must not accept negative offsets, since
3110  // they may fall just outside that range, but we may accept pretty large positive ones.
3111  if (M == CodeModel::Kernel && Offset > 0)
3112    return true;
3113
3114  return false;
3115}
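// A few illustrative inputs for the predicate above (hypothetical values, not
// taken from the code base):
//   isOffsetSuitableForCodeModel(4 << 20,   CodeModel::Small,  true)  -> true
//   isOffsetSuitableForCodeModel(-16,       CodeModel::Kernel, true)  -> false
//   isOffsetSuitableForCodeModel(1 << 20,   CodeModel::Large,  true)  -> false
//   isOffsetSuitableForCodeModel(1LL << 40, CodeModel::Small,  false) -> false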
3116
3117/// isCalleePop - Determines whether the callee is required to pop its
3118/// own arguments. Callee pop is necessary to support tail calls.
3119bool X86::isCalleePop(CallingConv::ID CallingConv,
3120                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3121  if (IsVarArg)
3122    return false;
3123
3124  switch (CallingConv) {
3125  default:
3126    return false;
3127  case CallingConv::X86_StdCall:
3128    return !is64Bit;
3129  case CallingConv::X86_FastCall:
3130    return !is64Bit;
3131  case CallingConv::X86_ThisCall:
3132    return !is64Bit;
3133  case CallingConv::Fast:
3134    return TailCallOpt;
3135  case CallingConv::GHC:
3136    return TailCallOpt;
3137  case CallingConv::HiPE:
3138    return TailCallOpt;
3139  }
3140}
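// Some representative results of the query above (illustrative only):
//   isCalleePop(CallingConv::X86_StdCall, /*is64Bit=*/false, false, false)    -> true
//   isCalleePop(CallingConv::X86_StdCall, /*is64Bit=*/true,  false, false)    -> false
//   isCalleePop(CallingConv::Fast,        false, false, /*TailCallOpt=*/true) -> true
//   isCalleePop(CallingConv::C,           false, /*IsVarArg=*/true,  true)    -> false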
3141
3142/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
3143/// X86-specific condition code, returning the condition code and the LHS/RHS
3144/// of the comparison to make.
3145static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3146                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3147  if (!isFP) {
3148    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3149      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3150        // X > -1   -> X == 0, jump !sign.
3151        RHS = DAG.getConstant(0, RHS.getValueType());
3152        return X86::COND_NS;
3153      }
3154      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3155        // X < 0   -> X == 0, jump on sign.
3156        return X86::COND_S;
3157      }
3158      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3159        // X < 1   -> X <= 0
3160        RHS = DAG.getConstant(0, RHS.getValueType());
3161        return X86::COND_LE;
3162      }
3163    }
3164
3165    switch (SetCCOpcode) {
3166    default: llvm_unreachable("Invalid integer condition!");
3167    case ISD::SETEQ:  return X86::COND_E;
3168    case ISD::SETGT:  return X86::COND_G;
3169    case ISD::SETGE:  return X86::COND_GE;
3170    case ISD::SETLT:  return X86::COND_L;
3171    case ISD::SETLE:  return X86::COND_LE;
3172    case ISD::SETNE:  return X86::COND_NE;
3173    case ISD::SETULT: return X86::COND_B;
3174    case ISD::SETUGT: return X86::COND_A;
3175    case ISD::SETULE: return X86::COND_BE;
3176    case ISD::SETUGE: return X86::COND_AE;
3177    }
3178  }
3179
3180  // First determine if it is required or is profitable to flip the operands.
3181
3182  // If LHS is a foldable load, but RHS is not, flip the condition.
3183  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3184      !ISD::isNON_EXTLoad(RHS.getNode())) {
3185    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3186    std::swap(LHS, RHS);
3187  }
3188
3189  switch (SetCCOpcode) {
3190  default: break;
3191  case ISD::SETOLT:
3192  case ISD::SETOLE:
3193  case ISD::SETUGT:
3194  case ISD::SETUGE:
3195    std::swap(LHS, RHS);
3196    break;
3197  }
3198
3199  // On a floating point condition, the flags are set as follows:
3200  // ZF  PF  CF   op
3201  //  0 | 0 | 0 | X > Y
3202  //  0 | 0 | 1 | X < Y
3203  //  1 | 0 | 0 | X == Y
3204  //  1 | 1 | 1 | unordered
3205  switch (SetCCOpcode) {
3206  default: llvm_unreachable("Condcode should be pre-legalized away");
3207  case ISD::SETUEQ:
3208  case ISD::SETEQ:   return X86::COND_E;
3209  case ISD::SETOLT:              // flipped
3210  case ISD::SETOGT:
3211  case ISD::SETGT:   return X86::COND_A;
3212  case ISD::SETOLE:              // flipped
3213  case ISD::SETOGE:
3214  case ISD::SETGE:   return X86::COND_AE;
3215  case ISD::SETUGT:              // flipped
3216  case ISD::SETULT:
3217  case ISD::SETLT:   return X86::COND_B;
3218  case ISD::SETUGE:              // flipped
3219  case ISD::SETULE:
3220  case ISD::SETLE:   return X86::COND_BE;
3221  case ISD::SETONE:
3222  case ISD::SETNE:   return X86::COND_NE;
3223  case ISD::SETUO:   return X86::COND_P;
3224  case ISD::SETO:    return X86::COND_NP;
3225  case ISD::SETOEQ:
3226  case ISD::SETUNE:  return X86::COND_INVALID;
3227  }
3228}
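// As a concrete floating-point example of the translation above: for
// (setolt %x, %y) the condcode is in the operand-swap list, so %x and %y are
// exchanged and X86::COND_A is returned; "%x olt %y" is then tested as
// "%y above %x" (ZF == 0 && CF == 0), which is correctly false for unordered
// (NaN) inputs, where CF is set.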
3229
3230/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
3231/// code? The current x86 ISA includes the following FP cmov instructions:
3232/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3233static bool hasFPCMov(unsigned X86CC) {
3234  switch (X86CC) {
3235  default:
3236    return false;
3237  case X86::COND_B:
3238  case X86::COND_BE:
3239  case X86::COND_E:
3240  case X86::COND_P:
3241  case X86::COND_A:
3242  case X86::COND_AE:
3243  case X86::COND_NE:
3244  case X86::COND_NP:
3245    return true;
3246  }
3247}
3248
3249/// isFPImmLegal - Returns true if the target can instruction select the
3250/// specified FP immediate natively. If false, the legalizer will
3251/// materialize the FP immediate as a load from a constant pool.
3252bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3253  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3254    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3255      return true;
3256  }
3257  return false;
3258}
3259
3260/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3261/// the specified range [Low, Hi).
3262static bool isUndefOrInRange(int Val, int Low, int Hi) {
3263  return (Val < 0) || (Val >= Low && Val < Hi);
3264}
3265
3266/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3267/// specified value.
3268static bool isUndefOrEqual(int Val, int CmpVal) {
3269  return (Val < 0 || Val == CmpVal);
3270}
3271
3272/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3273/// from position Pos and ending at Pos+Size, falls within the specified
3274/// sequential range [Low, Low+Size), or is undef.
3275static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3276                                       unsigned Pos, unsigned Size, int Low) {
3277  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3278    if (!isUndefOrEqual(Mask[i], Low))
3279      return false;
3280  return true;
3281}
3282
3283/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3284/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
3285/// the second operand.
3286static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
3287  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
3288    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
3289  if (VT == MVT::v2f64 || VT == MVT::v2i64)
3290    return (Mask[0] < 2 && Mask[1] < 2);
3291  return false;
3292}
3293
3294/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3295/// is suitable for input to PSHUFHW.
3296static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
3297  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3298    return false;
3299
3300  // Lower quadword copied in order or undef.
3301  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3302    return false;
3303
3304  // Upper quadword shuffled.
3305  for (unsigned i = 4; i != 8; ++i)
3306    if (!isUndefOrInRange(Mask[i], 4, 8))
3307      return false;
3308
3309  if (VT == MVT::v16i16) {
3310    // Lower quadword copied in order or undef.
3311    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3312      return false;
3313
3314    // Upper quadword shuffled.
3315    for (unsigned i = 12; i != 16; ++i)
3316      if (!isUndefOrInRange(Mask[i], 12, 16))
3317        return false;
3318  }
3319
3320  return true;
3321}
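// Example mask accepted by the check above: for v8i16,
//   <0, 1, 2, 3, 7, 6, 5, 4>
// keeps the low quadword in order and freely permutes the high quadword, which
// is exactly what a single PSHUFHW can do.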
3322
3323/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3324/// is suitable for input to PSHUFLW.
3325static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
3326  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3327    return false;
3328
3329  // Upper quadword copied in order.
3330  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3331    return false;
3332
3333  // Lower quadword shuffled.
3334  for (unsigned i = 0; i != 4; ++i)
3335    if (!isUndefOrInRange(Mask[i], 0, 4))
3336      return false;
3337
3338  if (VT == MVT::v16i16) {
3339    // Upper quadword copied in order.
3340    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
3341      return false;
3342
3343    // Lower quadword shuffled.
3344    for (unsigned i = 8; i != 12; ++i)
3345      if (!isUndefOrInRange(Mask[i], 8, 12))
3346        return false;
3347  }
3348
3349  return true;
3350}
3351
3352/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
3353/// is suitable for input to PALIGNR.
3354static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
3355                          const X86Subtarget *Subtarget) {
3356  if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
3357      (VT.getSizeInBits() == 256 && !Subtarget->hasInt256()))
3358    return false;
3359
3360  unsigned NumElts = VT.getVectorNumElements();
3361  unsigned NumLanes = VT.getSizeInBits()/128;
3362  unsigned NumLaneElts = NumElts/NumLanes;
3363
3364  // Do not handle 64-bit element shuffles with palignr.
3365  if (NumLaneElts == 2)
3366    return false;
3367
3368  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
3369    unsigned i;
3370    for (i = 0; i != NumLaneElts; ++i) {
3371      if (Mask[i+l] >= 0)
3372        break;
3373    }
3374
3375    // Lane is all undef, go to next lane
3376    if (i == NumLaneElts)
3377      continue;
3378
3379    int Start = Mask[i+l];
3380
3381    // Make sure it's in this lane in one of the sources
3382    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
3383        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
3384      return false;
3385
3386    // If not lane 0, then we must match lane 0
3387    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
3388      return false;
3389
3390    // Correct second source to be contiguous with first source
3391    if (Start >= (int)NumElts)
3392      Start -= NumElts - NumLaneElts;
3393
3394    // Make sure we're shifting in the right direction.
3395    if (Start <= (int)(i+l))
3396      return false;
3397
3398    Start -= i;
3399
3400    // Check the rest of the elements to see if they are consecutive.
3401    for (++i; i != NumLaneElts; ++i) {
3402      int Idx = Mask[i+l];
3403
3404      // Make sure it's in this lane
3405      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
3406          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
3407        return false;
3408
3409      // If not lane 0, then we must match lane 0
3410      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
3411        return false;
3412
3413      if (Idx >= (int)NumElts)
3414        Idx -= NumElts - NumLaneElts;
3415
3416      if (!isUndefOrEqual(Idx, Start+i))
3417        return false;
3418
3419    }
3420  }
3421
3422  return true;
3423}
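// Example mask accepted by the check above: for v8i16 with SSSE3,
//   <1, 2, 3, 4, 5, 6, 7, 8>
// selects elements 1..7 of V1 followed by element 0 of V2, i.e. a rotation by
// one element across the V1/V2 concatenation, which PALIGNR can implement with
// a byte-shift immediate.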
3424
3425/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3426/// the two vector operands have swapped position.
3427static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
3428                                     unsigned NumElems) {
3429  for (unsigned i = 0; i != NumElems; ++i) {
3430    int idx = Mask[i];
3431    if (idx < 0)
3432      continue;
3433    else if (idx < (int)NumElems)
3434      Mask[i] = idx + NumElems;
3435    else
3436      Mask[i] = idx - NumElems;
3437  }
3438}
3439
3440/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3441/// specifies a shuffle of elements that is suitable for input to 128/256-bit
3442/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
3443/// reverse of what x86 shuffles want.
3444static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
3445                        bool Commuted = false) {
3446  if (!HasFp256 && VT.getSizeInBits() == 256)
3447    return false;
3448
3449  unsigned NumElems = VT.getVectorNumElements();
3450  unsigned NumLanes = VT.getSizeInBits()/128;
3451  unsigned NumLaneElems = NumElems/NumLanes;
3452
3453  if (NumLaneElems != 2 && NumLaneElems != 4)
3454    return false;
3455
3456  // VSHUFPSY divides the resulting vector into 4 chunks.
3457  // The sources are also split into 4 chunks, and each destination
3458  // chunk must come from a different source chunk.
3459  //
3460  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
3461  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
3462  //
3463  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
3464  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
3465  //
3466  // VSHUFPDY divides the resulting vector into 4 chunks.
3467  // The sources are also split into 4 chunks, and each destination
3468  // chunk must come from a different source chunk.
3469  //
3470  //  SRC1 =>      X3       X2       X1       X0
3471  //  SRC2 =>      Y3       Y2       Y1       Y0
3472  //
3473  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
3474  //
3475  unsigned HalfLaneElems = NumLaneElems/2;
3476  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
3477    for (unsigned i = 0; i != NumLaneElems; ++i) {
3478      int Idx = Mask[i+l];
3479      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
3480      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
3481        return false;
3482      // For VSHUFPSY, the mask of the second half must be the same as the
3483      // first but with the appropriate offsets. This works in the same way as
3484      // VPERMILPS works with masks.
3485      if (NumElems != 8 || l == 0 || Mask[i] < 0)
3486        continue;
3487      if (!isUndefOrEqual(Idx, Mask[i]+l))
3488        return false;
3489    }
3490  }
3491
3492  return true;
3493}
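// Example masks for the check above on v4f32:
//   <0, 2, 4, 6> is accepted with Commuted == false (low half of the result
//   from V1, high half from V2, as SHUFPS requires), while
//   <4, 6, 0, 2> is only accepted with Commuted == true, i.e. after the
//   operands have been swapped.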
3494
3495/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3496/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3497static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
3498  if (!VT.is128BitVector())
3499    return false;
3500
3501  unsigned NumElems = VT.getVectorNumElements();
3502
3503  if (NumElems != 4)
3504    return false;
3505
3506  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
3507  return isUndefOrEqual(Mask[0], 6) &&
3508         isUndefOrEqual(Mask[1], 7) &&
3509         isUndefOrEqual(Mask[2], 2) &&
3510         isUndefOrEqual(Mask[3], 3);
3511}
3512
3513/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3514/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3515/// <2, 3, 2, 3>
3516static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
3517  if (!VT.is128BitVector())
3518    return false;
3519
3520  unsigned NumElems = VT.getVectorNumElements();
3521
3522  if (NumElems != 4)
3523    return false;
3524
3525  return isUndefOrEqual(Mask[0], 2) &&
3526         isUndefOrEqual(Mask[1], 3) &&
3527         isUndefOrEqual(Mask[2], 2) &&
3528         isUndefOrEqual(Mask[3], 3);
3529}
3530
3531/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3532/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
3533static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
3534  if (!VT.is128BitVector())
3535    return false;
3536
3537  unsigned NumElems = VT.getVectorNumElements();
3538
3539  if (NumElems != 2 && NumElems != 4)
3540    return false;
3541
3542  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3543    if (!isUndefOrEqual(Mask[i], i + NumElems))
3544      return false;
3545
3546  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
3547    if (!isUndefOrEqual(Mask[i], i))
3548      return false;
3549
3550  return true;
3551}
3552
3553/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3554/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
3555static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
3556  if (!VT.is128BitVector())
3557    return false;
3558
3559  unsigned NumElems = VT.getVectorNumElements();
3560
3561  if (NumElems != 2 && NumElems != 4)
3562    return false;
3563
3564  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3565    if (!isUndefOrEqual(Mask[i], i))
3566      return false;
3567
3568  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3569    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
3570      return false;
3571
3572  return true;
3573}
3574
3575//
3576// Some special combinations that can be optimized.
3577//
3578static
3579SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
3580                               SelectionDAG &DAG) {
3581  EVT VT = SVOp->getValueType(0);
3582  DebugLoc dl = SVOp->getDebugLoc();
3583
3584  if (VT != MVT::v8i32 && VT != MVT::v8f32)
3585    return SDValue();
3586
3587  ArrayRef<int> Mask = SVOp->getMask();
3588
3589  // These are the special masks that may be optimized.
3590  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
3591  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
3592  bool MatchEvenMask = true;
3593  bool MatchOddMask  = true;
3594  for (int i=0; i<8; ++i) {
3595    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
3596      MatchEvenMask = false;
3597    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
3598      MatchOddMask = false;
3599  }
3600
3601  if (!MatchEvenMask && !MatchOddMask)
3602    return SDValue();
3603
3604  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
3605
3606  SDValue Op0 = SVOp->getOperand(0);
3607  SDValue Op1 = SVOp->getOperand(1);
3608
3609  if (MatchEvenMask) {
3610    // Shift the second operand right by 32 bits.
3611    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
3612    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
3613  } else {
3614    // Shift the first operand left by 32 bits.
3615    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
3616    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
3617  }
3618  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
3619  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
3620}
3621
3622/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
3623/// specifies a shuffle of elements that is suitable for input to UNPCKL.
3624static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
3625                         bool HasInt256, bool V2IsSplat = false) {
3626  unsigned NumElts = VT.getVectorNumElements();
3627
3628  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3629         "Unsupported vector type for unpckh");
3630
3631  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3632      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3633    return false;
3634
3635  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3636  // independently on 128-bit lanes.
3637  unsigned NumLanes = VT.getSizeInBits()/128;
3638  unsigned NumLaneElts = NumElts/NumLanes;
3639
3640  for (unsigned l = 0; l != NumLanes; ++l) {
3641    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3642         i != (l+1)*NumLaneElts;
3643         i += 2, ++j) {
3644      int BitI  = Mask[i];
3645      int BitI1 = Mask[i+1];
3646      if (!isUndefOrEqual(BitI, j))
3647        return false;
3648      if (V2IsSplat) {
3649        if (!isUndefOrEqual(BitI1, NumElts))
3650          return false;
3651      } else {
3652        if (!isUndefOrEqual(BitI1, j + NumElts))
3653          return false;
3654      }
3655    }
3656  }
3657
3658  return true;
3659}
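// Example masks accepted by the check above:
//   v4i32  <0, 4, 1, 5>                - a plain PUNPCKLDQ/UNPCKLPS.
//   v8f32  <0, 8, 1, 9, 4, 12, 5, 13>  - AVX VUNPCKLPS, which interleaves the
//                                        low halves of each 128-bit lane
//                                        independently.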
3660
3661/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
3662/// specifies a shuffle of elements that is suitable for input to UNPCKH.
3663static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
3664                         bool HasInt256, bool V2IsSplat = false) {
3665  unsigned NumElts = VT.getVectorNumElements();
3666
3667  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3668         "Unsupported vector type for unpckh");
3669
3670  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3671      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3672    return false;
3673
3674  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3675  // independently on 128-bit lanes.
3676  unsigned NumLanes = VT.getSizeInBits()/128;
3677  unsigned NumLaneElts = NumElts/NumLanes;
3678
3679  for (unsigned l = 0; l != NumLanes; ++l) {
3680    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3681         i != (l+1)*NumLaneElts; i += 2, ++j) {
3682      int BitI  = Mask[i];
3683      int BitI1 = Mask[i+1];
3684      if (!isUndefOrEqual(BitI, j))
3685        return false;
3686      if (V2IsSplat) {
3687        if (isUndefOrEqual(BitI1, NumElts))
3688          return false;
3689      } else {
3690        if (!isUndefOrEqual(BitI1, j+NumElts))
3691          return false;
3692      }
3693    }
3694  }
3695  return true;
3696}
3697
3698/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
3699/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
3700/// <0, 0, 1, 1>
3701static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
3702                                  bool HasInt256) {
3703  unsigned NumElts = VT.getVectorNumElements();
3704
3705  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3706         "Unsupported vector type for unpckh");
3707
3708  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3709      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3710    return false;
3711
3712  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern.
3713  // FIXME: Need a better way to get rid of this, there's no latency difference
3714  // between UNPCKLPD and MOVDDUP; the latter should always be checked first and
3715  // the former later. We should also remove the "_undef" special mask.
3716  if (NumElts == 4 && VT.getSizeInBits() == 256)
3717    return false;
3718
3719  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3720  // independently on 128-bit lanes.
3721  unsigned NumLanes = VT.getSizeInBits()/128;
3722  unsigned NumLaneElts = NumElts/NumLanes;
3723
3724  for (unsigned l = 0; l != NumLanes; ++l) {
3725    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3726         i != (l+1)*NumLaneElts;
3727         i += 2, ++j) {
3728      int BitI  = Mask[i];
3729      int BitI1 = Mask[i+1];
3730
3731      if (!isUndefOrEqual(BitI, j))
3732        return false;
3733      if (!isUndefOrEqual(BitI1, j))
3734        return false;
3735    }
3736  }
3737
3738  return true;
3739}
3740
3741/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
3742/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
3743/// <2, 2, 3, 3>
3744static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
3745  unsigned NumElts = VT.getVectorNumElements();
3746
3747  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3748         "Unsupported vector type for unpckh");
3749
3750  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3751      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3752    return false;
3753
3754  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3755  // independently on 128-bit lanes.
3756  unsigned NumLanes = VT.getSizeInBits()/128;
3757  unsigned NumLaneElts = NumElts/NumLanes;
3758
3759  for (unsigned l = 0; l != NumLanes; ++l) {
3760    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3761         i != (l+1)*NumLaneElts; i += 2, ++j) {
3762      int BitI  = Mask[i];
3763      int BitI1 = Mask[i+1];
3764      if (!isUndefOrEqual(BitI, j))
3765        return false;
3766      if (!isUndefOrEqual(BitI1, j))
3767        return false;
3768    }
3769  }
3770  return true;
3771}
3772
3773/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3774/// specifies a shuffle of elements that is suitable for input to MOVSS,
3775/// MOVSD, and MOVD, i.e. setting the lowest element.
3776static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
3777  if (VT.getVectorElementType().getSizeInBits() < 32)
3778    return false;
3779  if (!VT.is128BitVector())
3780    return false;
3781
3782  unsigned NumElts = VT.getVectorNumElements();
3783
3784  if (!isUndefOrEqual(Mask[0], NumElts))
3785    return false;
3786
3787  for (unsigned i = 1; i != NumElts; ++i)
3788    if (!isUndefOrEqual(Mask[i], i))
3789      return false;
3790
3791  return true;
3792}
3793
3794/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
3795/// as permutations between 128-bit chunks or halves. As an example, in the
3796/// shuffle below:
3797///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
3798/// the first half comes from the second half of V1 and the second half comes
3799/// from the second half of V2.
3800static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
3801  if (!HasFp256 || !VT.is256BitVector())
3802    return false;
3803
3804  // The shuffle result is divided into half A and half B. In total the two
3805  // sources have 4 halves, namely: C, D, E, F. The final values of A and
3806  // B must come from C, D, E or F.
3807  unsigned HalfSize = VT.getVectorNumElements()/2;
3808  bool MatchA = false, MatchB = false;
3809
3810  // Check if A comes from one of C, D, E, F.
3811  for (unsigned Half = 0; Half != 4; ++Half) {
3812    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
3813      MatchA = true;
3814      break;
3815    }
3816  }
3817
3818  // Check if B comes from one of C, D, E, F.
3819  for (unsigned Half = 0; Half != 4; ++Half) {
3820    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
3821      MatchB = true;
3822      break;
3823    }
3824  }
3825
3826  return MatchA && MatchB;
3827}
3828
3829/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
3830/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
3831static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
3832  EVT VT = SVOp->getValueType(0);
3833
3834  unsigned HalfSize = VT.getVectorNumElements()/2;
3835
3836  unsigned FstHalf = 0, SndHalf = 0;
3837  for (unsigned i = 0; i < HalfSize; ++i) {
3838    if (SVOp->getMaskElt(i) > 0) {
3839      FstHalf = SVOp->getMaskElt(i)/HalfSize;
3840      break;
3841    }
3842  }
3843  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
3844    if (SVOp->getMaskElt(i) > 0) {
3845      SndHalf = SVOp->getMaskElt(i)/HalfSize;
3846      break;
3847    }
3848  }
3849
3850  return (FstHalf | (SndHalf << 4));
3851}
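// Worked example for the immediate computation above: for the v8i32 mask
// <4, 5, 6, 7, 12, 13, 14, 15>, the first result half starts at element 4, so
// FstHalf = 4/4 = 1 (upper half of V1); the second half starts at element 12,
// so SndHalf = 12/4 = 3 (upper half of V2). The returned immediate is
// 1 | (3 << 4) = 0x31.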
3852
3853/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
3854/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
3855/// Note that VPERMIL mask matching differs depending on whether the underlying
3856/// type is 32 or 64 bits. For VPERMILPS, the high half of the mask should select
3857/// the same elements as the low half, but from the upper half of the source.
3858/// In VPERMILPD the two lanes could be shuffled independently of each other
3859/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
3860static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
3861  if (!HasFp256)
3862    return false;
3863
3864  unsigned NumElts = VT.getVectorNumElements();
3865  // Only match 256-bit with 32/64-bit types
3866  if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
3867    return false;
3868
3869  unsigned NumLanes = VT.getSizeInBits()/128;
3870  unsigned LaneSize = NumElts/NumLanes;
3871  for (unsigned l = 0; l != NumElts; l += LaneSize) {
3872    for (unsigned i = 0; i != LaneSize; ++i) {
3873      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
3874        return false;
3875      if (NumElts != 8 || l == 0)
3876        continue;
3877      // VPERMILPS handling
3878      if (Mask[i] < 0)
3879        continue;
3880      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
3881        return false;
3882    }
3883  }
3884
3885  return true;
3886}
3887
3888/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
3889/// x86 movss wants. X86 movs requires the lowest element to be the lowest
3890/// element of vector 2 and the other elements to come from vector 1 in order.
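/// For example, for a v4i32 shuffle the matching mask is <0, 5, 6, 7>: element 0
/// comes from V1 and the remaining elements come from V2 in order.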
3891static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
3892                               bool V2IsSplat = false, bool V2IsUndef = false) {
3893  if (!VT.is128BitVector())
3894    return false;
3895
3896  unsigned NumOps = VT.getVectorNumElements();
3897  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3898    return false;
3899
3900  if (!isUndefOrEqual(Mask[0], 0))
3901    return false;
3902
3903  for (unsigned i = 1; i != NumOps; ++i)
3904    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3905          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3906          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3907      return false;
3908
3909  return true;
3910}
3911
3912/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3913/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3914/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
3915static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
3916                           const X86Subtarget *Subtarget) {
3917  if (!Subtarget->hasSSE3())
3918    return false;
3919
3920  unsigned NumElems = VT.getVectorNumElements();
3921
3922  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3923      (VT.getSizeInBits() == 256 && NumElems != 8))
3924    return false;
3925
3926  // "i+1" is the value the indexed mask element must have
3927  for (unsigned i = 0; i != NumElems; i += 2)
3928    if (!isUndefOrEqual(Mask[i], i+1) ||
3929        !isUndefOrEqual(Mask[i+1], i+1))
3930      return false;
3931
3932  return true;
3933}
3934
3935/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3936/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3937/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
3938static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
3939                           const X86Subtarget *Subtarget) {
3940  if (!Subtarget->hasSSE3())
3941    return false;
3942
3943  unsigned NumElems = VT.getVectorNumElements();
3944
3945  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3946      (VT.getSizeInBits() == 256 && NumElems != 8))
3947    return false;
3948
3949  // "i" is the value the indexed mask element must have
3950  for (unsigned i = 0; i != NumElems; i += 2)
3951    if (!isUndefOrEqual(Mask[i], i) ||
3952        !isUndefOrEqual(Mask[i+1], i))
3953      return false;
3954
3955  return true;
3956}
3957
3958/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
3959/// specifies a shuffle of elements that is suitable for input to 256-bit
3960/// version of MOVDDUP.
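/// Masks to match: <0, 0, 2, 2>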
3961static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
3962  if (!HasFp256 || !VT.is256BitVector())
3963    return false;
3964
3965  unsigned NumElts = VT.getVectorNumElements();
3966  if (NumElts != 4)
3967    return false;
3968
3969  for (unsigned i = 0; i != NumElts/2; ++i)
3970    if (!isUndefOrEqual(Mask[i], 0))
3971      return false;
3972  for (unsigned i = NumElts/2; i != NumElts; ++i)
3973    if (!isUndefOrEqual(Mask[i], NumElts/2))
3974      return false;
3975  return true;
3976}
3977
3978/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3979/// specifies a shuffle of elements that is suitable for input to 128-bit
3980/// version of MOVDDUP.
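/// Masks to match: <0, 0> for v2f64, or <0, 1, 0, 1> for v4f32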
3981static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
3982  if (!VT.is128BitVector())
3983    return false;
3984
3985  unsigned e = VT.getVectorNumElements() / 2;
3986  for (unsigned i = 0; i != e; ++i)
3987    if (!isUndefOrEqual(Mask[i], i))
3988      return false;
3989  for (unsigned i = 0; i != e; ++i)
3990    if (!isUndefOrEqual(Mask[e+i], i))
3991      return false;
3992  return true;
3993}
3994
3995/// isVEXTRACTF128Index - Return true if the specified
3996/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
3997/// suitable for input to VEXTRACTF128.
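/// For example, extracting a v4i32 subvector from a v8i32 source at element
/// index 4 is suitable (4 * 32 == 128 bits), while element index 2 is not.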
3998bool X86::isVEXTRACTF128Index(SDNode *N) {
3999  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4000    return false;
4001
4002  // The index should be aligned on a 128-bit boundary.
4003  uint64_t Index =
4004    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4005
4006  unsigned VL = N->getValueType(0).getVectorNumElements();
4007  unsigned VBits = N->getValueType(0).getSizeInBits();
4008  unsigned ElSize = VBits / VL;
4009  bool Result = (Index * ElSize) % 128 == 0;
4010
4011  return Result;
4012}
4013
4014/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
4015/// operand specifies a subvector insert that is suitable for input to
4016/// VINSERTF128.
4017bool X86::isVINSERTF128Index(SDNode *N) {
4018  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4019    return false;
4020
4021  // The index should be aligned on a 128-bit boundary.
4022  uint64_t Index =
4023    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4024
4025  unsigned VL = N->getValueType(0).getVectorNumElements();
4026  unsigned VBits = N->getValueType(0).getSizeInBits();
4027  unsigned ElSize = VBits / VL;
4028  bool Result = (Index * ElSize) % 128 == 0;
4029
4030  return Result;
4031}
4032
4033/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4034/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4035/// Handles 128-bit and 256-bit.
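/// For example, a v4i32 mask <3, 2, 1, 0> encodes each element index in 2 bits
/// and yields the immediate 0x1B (0b00011011).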
4036static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4037  EVT VT = N->getValueType(0);
4038
4039  assert((VT.is128BitVector() || VT.is256BitVector()) &&
4040         "Unsupported vector type for PSHUF/SHUFP");
4041
4042  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4043  // independently on 128-bit lanes.
4044  unsigned NumElts = VT.getVectorNumElements();
4045  unsigned NumLanes = VT.getSizeInBits()/128;
4046  unsigned NumLaneElts = NumElts/NumLanes;
4047
4048  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
4049         "Only supports 2 or 4 elements per lane");
4050
4051  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
4052  unsigned Mask = 0;
4053  for (unsigned i = 0; i != NumElts; ++i) {
4054    int Elt = N->getMaskElt(i);
4055    if (Elt < 0) continue;
4056    Elt &= NumLaneElts - 1;
4057    unsigned ShAmt = (i << Shift) % 8;
4058    Mask |= Elt << ShAmt;
4059  }
4060
4061  return Mask;
4062}
4063
4064/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4065/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
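/// For example, a v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4> leaves the low half alone
/// and reverses the high half, yielding the immediate 0x1B.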
4066static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4067  EVT VT = N->getValueType(0);
4068
4069  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4070         "Unsupported vector type for PSHUFHW");
4071
4072  unsigned NumElts = VT.getVectorNumElements();
4073
4074  unsigned Mask = 0;
4075  for (unsigned l = 0; l != NumElts; l += 8) {
4076    // 8 elements per lane, but we only care about the last 4.
4077    for (unsigned i = 0; i < 4; ++i) {
4078      int Elt = N->getMaskElt(l+i+4);
4079      if (Elt < 0) continue;
4080      Elt &= 0x3; // only 2-bits.
4081      Mask |= Elt << (i * 2);
4082    }
4083  }
4084
4085  return Mask;
4086}
4087
4088/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4089/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4090static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4091  EVT VT = N->getValueType(0);
4092
4093  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4094         "Unsupported vector type for PSHUFHW");
4095
4096  unsigned NumElts = VT.getVectorNumElements();
4097
4098  unsigned Mask = 0;
4099  for (unsigned l = 0; l != NumElts; l += 8) {
4100    // 8 elements per lane, but we only care about the first 4.
4101    for (unsigned i = 0; i < 4; ++i) {
4102      int Elt = N->getMaskElt(l+i);
4103      if (Elt < 0) continue;
4104      Elt &= 0x3; // only 2-bits
4105      Mask |= Elt << (i * 2);
4106    }
4107  }
4108
4109  return Mask;
4110}
4111
4112/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
4113/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
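/// The returned immediate is a byte count. For example, a v16i8 mask
/// <5, 6, ..., 20> yields 5, and a v8i16 mask <3, 4, ..., 10> yields 6
/// (3 elements * 2 bytes per element).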
4114static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4115  EVT VT = SVOp->getValueType(0);
4116  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
4117
4118  unsigned NumElts = VT.getVectorNumElements();
4119  unsigned NumLanes = VT.getSizeInBits()/128;
4120  unsigned NumLaneElts = NumElts/NumLanes;
4121
4122  int Val = 0;
4123  unsigned i;
4124  for (i = 0; i != NumElts; ++i) {
4125    Val = SVOp->getMaskElt(i);
4126    if (Val >= 0)
4127      break;
4128  }
4129  if (Val >= (int)NumElts)
4130    Val -= NumElts - NumLaneElts;
4131
4132  assert(Val - i > 0 && "PALIGNR imm should be positive");
4133  return (Val - i) * EltSize;
4134}
4135
4136/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
4137/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4138/// instructions.
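/// For example, extracting a 128-bit subvector from a v8i32 source at element
/// index 4 gives the immediate 1 (the upper 128-bit half).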
4139unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
4140  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4141    llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
4142
4143  uint64_t Index =
4144    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4145
4146  EVT VecVT = N->getOperand(0).getValueType();
4147  EVT ElVT = VecVT.getVectorElementType();
4148
4149  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
4150  return Index / NumElemsPerChunk;
4151}
4152
4153/// getInsertVINSERTF128Immediate - Return the appropriate immediate
4154/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
4155/// instructions.
4156unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
4157  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4158    llvm_unreachable("Illegal insert subvector for VINSERTF128");
4159
4160  uint64_t Index =
4161    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4162
4163  EVT VecVT = N->getValueType(0);
4164  EVT ElVT = VecVT.getVectorElementType();
4165
4166  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
4167  return Index / NumElemsPerChunk;
4168}
4169
4170/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
4171/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
4172/// Handles 256-bit.
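/// For example, a v4i64 mask <3, 1, 2, 0> encodes each element index in 2 bits
/// and yields the immediate 0x27.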
4173static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
4174  EVT VT = N->getValueType(0);
4175
4176  unsigned NumElts = VT.getVectorNumElements();
4177
4178  assert((VT.is256BitVector() && NumElts == 4) &&
4179         "Unsupported vector type for VPERMQ/VPERMPD");
4180
4181  unsigned Mask = 0;
4182  for (unsigned i = 0; i != NumElts; ++i) {
4183    int Elt = N->getMaskElt(i);
4184    if (Elt < 0)
4185      continue;
4186    Mask |= Elt << (i*2);
4187  }
4188
4189  return Mask;
4190}

4191/// isZeroNode - Returns true if Elt is a constant zero or a floating point
4192/// constant +0.0.
4193bool X86::isZeroNode(SDValue Elt) {
4194  return ((isa<ConstantSDNode>(Elt) &&
4195           cast<ConstantSDNode>(Elt)->isNullValue()) ||
4196          (isa<ConstantFPSDNode>(Elt) &&
4197           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
4198}
4199
4200/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
4201/// their permute mask.
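/// For example, shuffle(V1, V2, <4, 1, 2, 3>) becomes shuffle(V2, V1, <0, 5, 6, 7>).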
4202static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
4203                                    SelectionDAG &DAG) {
4204  EVT VT = SVOp->getValueType(0);
4205  unsigned NumElems = VT.getVectorNumElements();
4206  SmallVector<int, 8> MaskVec;
4207
4208  for (unsigned i = 0; i != NumElems; ++i) {
4209    int Idx = SVOp->getMaskElt(i);
4210    if (Idx >= 0) {
4211      if (Idx < (int)NumElems)
4212        Idx += NumElems;
4213      else
4214        Idx -= NumElems;
4215    }
4216    MaskVec.push_back(Idx);
4217  }
4218  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
4219                              SVOp->getOperand(0), &MaskVec[0]);
4220}
4221
4222/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
4223/// match movhlps. The lower half elements should come from upper half of
4224/// V1 (and in order), and the upper half elements should come from the upper
4225/// half of V2 (and in order).
4226static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
4227  if (!VT.is128BitVector())
4228    return false;
4229  if (VT.getVectorNumElements() != 4)
4230    return false;
4231  for (unsigned i = 0, e = 2; i != e; ++i)
4232    if (!isUndefOrEqual(Mask[i], i+2))
4233      return false;
4234  for (unsigned i = 2; i != 4; ++i)
4235    if (!isUndefOrEqual(Mask[i], i+4))
4236      return false;
4237  return true;
4238}
4239
4240/// isScalarLoadToVector - Returns true if the node is a scalar load that
4241/// is promoted to a vector. It also returns the LoadSDNode by reference if
4242/// required.
4243static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
4244  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
4245    return false;
4246  N = N->getOperand(0).getNode();
4247  if (!ISD::isNON_EXTLoad(N))
4248    return false;
4249  if (LD)
4250    *LD = cast<LoadSDNode>(N);
4251  return true;
4252}
4253
4254// Test whether the given value is a vector value which will be legalized
4255// into a load.
4256static bool WillBeConstantPoolLoad(SDNode *N) {
4257  if (N->getOpcode() != ISD::BUILD_VECTOR)
4258    return false;
4259
4260  // Check for any non-constant elements.
4261  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
4262    switch (N->getOperand(i).getNode()->getOpcode()) {
4263    case ISD::UNDEF:
4264    case ISD::ConstantFP:
4265    case ISD::Constant:
4266      break;
4267    default:
4268      return false;
4269    }
4270
4271  // Vectors of all-zeros and all-ones are materialized with special
4272  // instructions rather than being loaded.
4273  return !ISD::isBuildVectorAllZeros(N) &&
4274         !ISD::isBuildVectorAllOnes(N);
4275}
4276
4277/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
4278/// match movlp{s|d}. The lower half elements should come from lower half of
4279/// V1 (and in order), and the upper half elements should come from the upper
4280/// half of V2 (and in order). And since V1 will become the source of the
4281/// MOVLP, it must be either a vector load or a scalar load to vector.
4282static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
4283                               ArrayRef<int> Mask, EVT VT) {
4284  if (!VT.is128BitVector())
4285    return false;
4286
4287  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
4288    return false;
4289  // If V2 is a vector load, don't do this transformation. We will try to use
4290  // a load-folding shufps op instead.
4291  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
4292    return false;
4293
4294  unsigned NumElems = VT.getVectorNumElements();
4295
4296  if (NumElems != 2 && NumElems != 4)
4297    return false;
4298  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4299    if (!isUndefOrEqual(Mask[i], i))
4300      return false;
4301  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4302    if (!isUndefOrEqual(Mask[i], i+NumElems))
4303      return false;
4304  return true;
4305}
4306
4307/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
4308/// all the same.
4309static bool isSplatVector(SDNode *N) {
4310  if (N->getOpcode() != ISD::BUILD_VECTOR)
4311    return false;
4312
4313  SDValue SplatValue = N->getOperand(0);
4314  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
4315    if (N->getOperand(i) != SplatValue)
4316      return false;
4317  return true;
4318}
4319
4320/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
4321/// to a zero vector.
4322/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
4323static bool isZeroShuffle(ShuffleVectorSDNode *N) {
4324  SDValue V1 = N->getOperand(0);
4325  SDValue V2 = N->getOperand(1);
4326  unsigned NumElems = N->getValueType(0).getVectorNumElements();
4327  for (unsigned i = 0; i != NumElems; ++i) {
4328    int Idx = N->getMaskElt(i);
4329    if (Idx >= (int)NumElems) {
4330      unsigned Opc = V2.getOpcode();
4331      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
4332        continue;
4333      if (Opc != ISD::BUILD_VECTOR ||
4334          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
4335        return false;
4336    } else if (Idx >= 0) {
4337      unsigned Opc = V1.getOpcode();
4338      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
4339        continue;
4340      if (Opc != ISD::BUILD_VECTOR ||
4341          !X86::isZeroNode(V1.getOperand(Idx)))
4342        return false;
4343    }
4344  }
4345  return true;
4346}
4347
4348/// getZeroVector - Returns a vector of specified type with all zero elements.
4349///
4350static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
4351                             SelectionDAG &DAG, DebugLoc dl) {
4352  assert(VT.isVector() && "Expected a vector type");
4353  unsigned Size = VT.getSizeInBits();
4354
4355  // Always build SSE zero vectors as <4 x i32> bitcasted
4356  // to their dest type. This ensures they get CSE'd.
4357  SDValue Vec;
4358  if (Size == 128) {  // SSE
4359    if (Subtarget->hasSSE2()) {  // SSE2
4360      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4361      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4362    } else { // SSE1
4363      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4364      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
4365    }
4366  } else if (Size == 256) { // AVX
4367    if (Subtarget->hasInt256()) { // AVX2
4368      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4369      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4370      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4371    } else {
4372      // 256-bit logic and arithmetic instructions in AVX are all
4373      // floating-point, no support for integer ops. Emit fp zeroed vectors.
4374      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4375      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4376      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
4377    }
4378  } else
4379    llvm_unreachable("Unexpected vector type");
4380
4381  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4382}
4383
4384/// getOnesVector - Returns a vector of specified type with all bits set.
4385/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4386/// no AVX2 support, use two <4 x i32> inserted into an <8 x i32> appropriately.
4387/// Then bitcast to their original type, ensuring they get CSE'd.
4388static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG,
4389                             DebugLoc dl) {
4390  assert(VT.isVector() && "Expected a vector type");
4391  unsigned Size = VT.getSizeInBits();
4392
4393  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
4394  SDValue Vec;
4395  if (Size == 256) {
4396    if (HasInt256) { // AVX2
4397      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4398      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4399    } else { // AVX
4400      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4401      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4402    }
4403  } else if (Size == 128) {
4404    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4405  } else
4406    llvm_unreachable("Unexpected vector type");
4407
4408  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4409}
4410
4411/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
4412/// that point to V2 point to its first element.
4413static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
4414  for (unsigned i = 0; i != NumElems; ++i) {
4415    if (Mask[i] > (int)NumElems) {
4416      Mask[i] = NumElems;
4417    }
4418  }
4419}
4420
4421/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
4422/// operation of specified width.
4423static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4424                       SDValue V2) {
4425  unsigned NumElems = VT.getVectorNumElements();
4426  SmallVector<int, 8> Mask;
4427  Mask.push_back(NumElems);
4428  for (unsigned i = 1; i != NumElems; ++i)
4429    Mask.push_back(i);
4430  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4431}
4432
4433/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
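/// For v4i32 this builds the mask <0, 4, 1, 5>.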
4434static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4435                          SDValue V2) {
4436  unsigned NumElems = VT.getVectorNumElements();
4437  SmallVector<int, 8> Mask;
4438  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4439    Mask.push_back(i);
4440    Mask.push_back(i + NumElems);
4441  }
4442  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4443}
4444
4445/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
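/// For v4i32 this builds the mask <2, 6, 3, 7>.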
4446static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4447                          SDValue V2) {
4448  unsigned NumElems = VT.getVectorNumElements();
4449  SmallVector<int, 8> Mask;
4450  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4451    Mask.push_back(i + Half);
4452    Mask.push_back(i + NumElems + Half);
4453  }
4454  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4455}
4456
4457// PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
4458// a generic shuffle instruction because the target has no such instructions.
4459// Generate shuffles which repeat i16 and i8 several times until they can be
4460// represented by v4f32 and then be manipulated by target supported shuffles.
4461static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
4462  EVT VT = V.getValueType();
4463  int NumElems = VT.getVectorNumElements();
4464  DebugLoc dl = V.getDebugLoc();
4465
4466  while (NumElems > 4) {
4467    if (EltNo < NumElems/2) {
4468      V = getUnpackl(DAG, dl, VT, V, V);
4469    } else {
4470      V = getUnpackh(DAG, dl, VT, V, V);
4471      EltNo -= NumElems/2;
4472    }
4473    NumElems >>= 1;
4474  }
4475  return V;
4476}
4477
4478/// getLegalSplat - Generate a legal splat with supported x86 shuffles
4479static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
4480  EVT VT = V.getValueType();
4481  DebugLoc dl = V.getDebugLoc();
4482  unsigned Size = VT.getSizeInBits();
4483
4484  if (Size == 128) {
4485    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
4486    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
4487    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
4488                             &SplatMask[0]);
4489  } else if (Size == 256) {
4490    // To use VPERMILPS to splat scalars, the second half of indices must
4491    // refer to the higher part, which is a duplication of the lower one,
4492    // because VPERMILPS can only handle in-lane permutations.
4493    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
4494                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
4495
4496    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
4497    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
4498                             &SplatMask[0]);
4499  } else
4500    llvm_unreachable("Vector size not supported");
4501
4502  return DAG.getNode(ISD::BITCAST, dl, VT, V);
4503}
4504
4505/// PromoteSplat - Splat is promoted to target supported vector shuffles.
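/// For example, splatting element 5 of a v8i32 extracts the upper 128-bit half
/// (rebasing the splat index to 1), concatenates that half with itself, and then
/// splats with the in-lane mask <1, 1, 1, 1, 5, 5, 5, 5>.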
4506static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
4507  EVT SrcVT = SV->getValueType(0);
4508  SDValue V1 = SV->getOperand(0);
4509  DebugLoc dl = SV->getDebugLoc();
4510
4511  int EltNo = SV->getSplatIndex();
4512  int NumElems = SrcVT.getVectorNumElements();
4513  unsigned Size = SrcVT.getSizeInBits();
4514
4515  assert(((Size == 128 && NumElems > 4) || Size == 256) &&
4516          "Unknown how to promote splat for type");
4517
4518  // Extract the 128-bit part containing the splat element and update
4519  // the splat element index when it refers to the higher register.
4520  if (Size == 256) {
4521    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
4522    if (EltNo >= NumElems/2)
4523      EltNo -= NumElems/2;
4524  }
4525
4526  // All i16 and i8 vector types can't be used directly by a generic shuffle
4527  // instruction because the target has no such instruction. Generate shuffles
4528  // which repeat i16 and i8 several times until they fit in i32, and then can
4529  // be manipulated by target supported shuffles.
4530  EVT EltVT = SrcVT.getVectorElementType();
4531  if (EltVT == MVT::i8 || EltVT == MVT::i16)
4532    V1 = PromoteSplati8i16(V1, DAG, EltNo);
4533
4534  // Recreate the 256-bit vector and place the same 128-bit vector
4535  // into the low and high part. This is necessary because we want
4536  // to use VPERM* to shuffle the vectors
4537  if (Size == 256) {
4538    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
4539  }
4540
4541  return getLegalSplat(DAG, V1, EltNo);
4542}
4543
4544/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
4545/// vector and a zero or undef vector.  This produces a shuffle where the low
4546/// element of V2 is swizzled into the zero/undef vector, landing at element
4547/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4548static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
4549                                           bool IsZero,
4550                                           const X86Subtarget *Subtarget,
4551                                           SelectionDAG &DAG) {
4552  EVT VT = V2.getValueType();
4553  SDValue V1 = IsZero
4554    ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
4555  unsigned NumElems = VT.getVectorNumElements();
4556  SmallVector<int, 16> MaskVec;
4557  for (unsigned i = 0; i != NumElems; ++i)
4558    // If this is the insertion idx, put the low elt of V2 here.
4559    MaskVec.push_back(i == Idx ? NumElems : i);
4560  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
4561}
4562
4563/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
4564/// target specific opcode. Returns true if the Mask could be calculated.
4565/// Sets IsUnary to true if only uses one source.
4566static bool getTargetShuffleMask(SDNode *N, MVT VT,
4567                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
4568  unsigned NumElems = VT.getVectorNumElements();
4569  SDValue ImmN;
4570
4571  IsUnary = false;
4572  switch(N->getOpcode()) {
4573  case X86ISD::SHUFP:
4574    ImmN = N->getOperand(N->getNumOperands()-1);
4575    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4576    break;
4577  case X86ISD::UNPCKH:
4578    DecodeUNPCKHMask(VT, Mask);
4579    break;
4580  case X86ISD::UNPCKL:
4581    DecodeUNPCKLMask(VT, Mask);
4582    break;
4583  case X86ISD::MOVHLPS:
4584    DecodeMOVHLPSMask(NumElems, Mask);
4585    break;
4586  case X86ISD::MOVLHPS:
4587    DecodeMOVLHPSMask(NumElems, Mask);
4588    break;
4589  case X86ISD::PSHUFD:
4590  case X86ISD::VPERMILP:
4591    ImmN = N->getOperand(N->getNumOperands()-1);
4592    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4593    IsUnary = true;
4594    break;
4595  case X86ISD::PSHUFHW:
4596    ImmN = N->getOperand(N->getNumOperands()-1);
4597    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4598    IsUnary = true;
4599    break;
4600  case X86ISD::PSHUFLW:
4601    ImmN = N->getOperand(N->getNumOperands()-1);
4602    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4603    IsUnary = true;
4604    break;
4605  case X86ISD::VPERMI:
4606    ImmN = N->getOperand(N->getNumOperands()-1);
4607    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4608    IsUnary = true;
4609    break;
4610  case X86ISD::MOVSS:
4611  case X86ISD::MOVSD: {
4612    // The index 0 always comes from the first element of the second source;
4613    // this is why MOVSS and MOVSD are used in the first place. The other
4614    // elements come from the other positions of the first source vector.
4615    Mask.push_back(NumElems);
4616    for (unsigned i = 1; i != NumElems; ++i) {
4617      Mask.push_back(i);
4618    }
4619    break;
4620  }
4621  case X86ISD::VPERM2X128:
4622    ImmN = N->getOperand(N->getNumOperands()-1);
4623    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4624    if (Mask.empty()) return false;
4625    break;
4626  case X86ISD::MOVDDUP:
4627  case X86ISD::MOVLHPD:
4628  case X86ISD::MOVLPD:
4629  case X86ISD::MOVLPS:
4630  case X86ISD::MOVSHDUP:
4631  case X86ISD::MOVSLDUP:
4632  case X86ISD::PALIGN:
4633    // Not yet implemented
4634    return false;
4635  default: llvm_unreachable("unknown target shuffle node");
4636  }
4637
4638  return true;
4639}
4640
4641/// getShuffleScalarElt - Returns the scalar element that will make up the ith
4642/// element of the result of the vector shuffle.
4643static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
4644                                   unsigned Depth) {
4645  if (Depth == 6)
4646    return SDValue();  // Limit search depth.
4647
4648  SDValue V = SDValue(N, 0);
4649  EVT VT = V.getValueType();
4650  unsigned Opcode = V.getOpcode();
4651
4652  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
4653  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
4654    int Elt = SV->getMaskElt(Index);
4655
4656    if (Elt < 0)
4657      return DAG.getUNDEF(VT.getVectorElementType());
4658
4659    unsigned NumElems = VT.getVectorNumElements();
4660    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
4661                                         : SV->getOperand(1);
4662    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
4663  }
4664
4665  // Recurse into target specific vector shuffles to find scalars.
4666  if (isTargetShuffle(Opcode)) {
4667    MVT ShufVT = V.getValueType().getSimpleVT();
4668    unsigned NumElems = ShufVT.getVectorNumElements();
4669    SmallVector<int, 16> ShuffleMask;
4670    bool IsUnary;
4671
4672    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
4673      return SDValue();
4674
4675    int Elt = ShuffleMask[Index];
4676    if (Elt < 0)
4677      return DAG.getUNDEF(ShufVT.getVectorElementType());
4678
4679    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
4680                                         : N->getOperand(1);
4681    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
4682                               Depth+1);
4683  }
4684
4685  // Actual nodes that may contain scalar elements
4686  if (Opcode == ISD::BITCAST) {
4687    V = V.getOperand(0);
4688    EVT SrcVT = V.getValueType();
4689    unsigned NumElems = VT.getVectorNumElements();
4690
4691    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
4692      return SDValue();
4693  }
4694
4695  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
4696    return (Index == 0) ? V.getOperand(0)
4697                        : DAG.getUNDEF(VT.getVectorElementType());
4698
4699  if (V.getOpcode() == ISD::BUILD_VECTOR)
4700    return V.getOperand(Index);
4701
4702  return SDValue();
4703}
4704
4705/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
4706/// vector shuffle operation which come from a zero vector. The search can
4707/// start in two different directions, from the left or the right.
4708static
4709unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
4710                                  bool ZerosFromLeft, SelectionDAG &DAG) {
4711  unsigned i;
4712  for (i = 0; i != NumElems; ++i) {
4713    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
4714    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
4715    if (!(Elt.getNode() &&
4716         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
4717      break;
4718  }
4719
4720  return i;
4721}
4722
4723/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
4724/// correspond consecutively to elements from one of the vector operands,
4725/// starting from its index OpIdx. Also reports in OpNum which source operand matched.
4726static
4727bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
4728                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
4729                              unsigned NumElems, unsigned &OpNum) {
4730  bool SeenV1 = false;
4731  bool SeenV2 = false;
4732
4733  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
4734    int Idx = SVOp->getMaskElt(i);
4735    // Ignore undef indices
4736    if (Idx < 0)
4737      continue;
4738
4739    if (Idx < (int)NumElems)
4740      SeenV1 = true;
4741    else
4742      SeenV2 = true;
4743
4744    // Only accept consecutive elements from the same vector
4745    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
4746      return false;
4747  }
4748
4749  OpNum = SeenV1 ? 0 : 1;
4750  return true;
4751}
4752
4753/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
4754/// logical right shift of a vector.
4755static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4756                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4757  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4758  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
4759              false /* check zeros from right */, DAG);
4760  unsigned OpSrc;
4761
4762  if (!NumZeros)
4763    return false;
4764
4765  // Considering the elements in the mask that are not consecutive zeros,
4766  // check if they consecutively come from only one of the source vectors.
4767  //
4768  //               V1 = {X, A, B, C}     0
4769  //                         \  \  \    /
4770  //   vector_shuffle V1, V2 <1, 2, 3, X>
4771  //
4772  if (!isShuffleMaskConsecutive(SVOp,
4773            0,                   // Mask Start Index
4774            NumElems-NumZeros,   // Mask End Index(exclusive)
4775            NumZeros,            // Where to start looking in the src vector
4776            NumElems,            // Number of elements in vector
4777            OpSrc))              // Which source operand ?
4778    return false;
4779
4780  isLeft = false;
4781  ShAmt = NumZeros;
4782  ShVal = SVOp->getOperand(OpSrc);
4783  return true;
4784}
4785
4786/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
4787/// logical left shift of a vector.
4788static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4789                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4790  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4791  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
4792              true /* check zeros from left */, DAG);
4793  unsigned OpSrc;
4794
4795  if (!NumZeros)
4796    return false;
4797
4798  // Considering the elements in the mask that are not consecutive zeros,
4799  // check if they consecutively come from only one of the source vectors.
4800  //
4801  //                           0    { A, B, X, X } = V2
4802  //                          / \    /  /
4803  //   vector_shuffle V1, V2 <X, X, 4, 5>
4804  //
4805  if (!isShuffleMaskConsecutive(SVOp,
4806            NumZeros,     // Mask Start Index
4807            NumElems,     // Mask End Index(exclusive)
4808            0,            // Where to start looking in the src vector
4809            NumElems,     // Number of elements in vector
4810            OpSrc))       // Which source operand ?
4811    return false;
4812
4813  isLeft = true;
4814  ShAmt = NumZeros;
4815  ShVal = SVOp->getOperand(OpSrc);
4816  return true;
4817}
4818
4819/// isVectorShift - Returns true if the shuffle can be implemented as a
4820/// logical left or right shift of a vector.
4821static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4822                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4823  // Although the logic below supports any bitwidth size, there are no
4824  // shift instructions which handle more than 128-bit vectors.
4825  if (!SVOp->getValueType(0).is128BitVector())
4826    return false;
4827
4828  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
4829      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
4830    return true;
4831
4832  return false;
4833}
4834
4835/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
4836///
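/// Roughly, each pair of adjacent operands (2k, 2k+1) is zero-extended and packed
/// into the i16 value (zext(Op[2k+1]) << 8) | zext(Op[2k]), which is inserted at
/// position k of a v8i16 (zero operands are simply skipped); the result is then
/// bitcast back to v16i8.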
4837static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
4838                                       unsigned NumNonZero, unsigned NumZero,
4839                                       SelectionDAG &DAG,
4840                                       const X86Subtarget* Subtarget,
4841                                       const TargetLowering &TLI) {
4842  if (NumNonZero > 8)
4843    return SDValue();
4844
4845  DebugLoc dl = Op.getDebugLoc();
4846  SDValue V(0, 0);
4847  bool First = true;
4848  for (unsigned i = 0; i < 16; ++i) {
4849    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
4850    if (ThisIsNonZero && First) {
4851      if (NumZero)
4852        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4853      else
4854        V = DAG.getUNDEF(MVT::v8i16);
4855      First = false;
4856    }
4857
4858    if ((i & 1) != 0) {
4859      SDValue ThisElt(0, 0), LastElt(0, 0);
4860      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
4861      if (LastIsNonZero) {
4862        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
4863                              MVT::i16, Op.getOperand(i-1));
4864      }
4865      if (ThisIsNonZero) {
4866        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
4867        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
4868                              ThisElt, DAG.getConstant(8, MVT::i8));
4869        if (LastIsNonZero)
4870          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
4871      } else
4872        ThisElt = LastElt;
4873
4874      if (ThisElt.getNode())
4875        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
4876                        DAG.getIntPtrConstant(i/2));
4877    }
4878  }
4879
4880  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
4881}
4882
4883/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
4884///
4885static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
4886                                     unsigned NumNonZero, unsigned NumZero,
4887                                     SelectionDAG &DAG,
4888                                     const X86Subtarget* Subtarget,
4889                                     const TargetLowering &TLI) {
4890  if (NumNonZero > 4)
4891    return SDValue();
4892
4893  DebugLoc dl = Op.getDebugLoc();
4894  SDValue V(0, 0);
4895  bool First = true;
4896  for (unsigned i = 0; i < 8; ++i) {
4897    bool isNonZero = (NonZeros & (1 << i)) != 0;
4898    if (isNonZero) {
4899      if (First) {
4900        if (NumZero)
4901          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4902        else
4903          V = DAG.getUNDEF(MVT::v8i16);
4904        First = false;
4905      }
4906      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
4907                      MVT::v8i16, V, Op.getOperand(i),
4908                      DAG.getIntPtrConstant(i));
4909    }
4910  }
4911
4912  return V;
4913}
4914
4915/// getVShift - Return a vector logical shift node.
4916///
4917static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
4918                         unsigned NumBits, SelectionDAG &DAG,
4919                         const TargetLowering &TLI, DebugLoc dl) {
4920  assert(VT.is128BitVector() && "Unknown type for VShift");
4921  EVT ShVT = MVT::v2i64;
4922  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
4923  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
4924  return DAG.getNode(ISD::BITCAST, dl, VT,
4925                     DAG.getNode(Opc, dl, ShVT, SrcOp,
4926                             DAG.getConstant(NumBits,
4927                                  TLI.getShiftAmountTy(SrcOp.getValueType()))));
4928}
4929
4930SDValue
4931X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
4932                                          SelectionDAG &DAG) const {
4933
4934  // Check if the scalar load can be widened into a vector load. And if
4935  // the address is "base + cst" see if the cst can be "absorbed" into
4936  // the shuffle mask.
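  // For example (illustrative), an i32 load from a frame index FI at offset 8,
  // splatted into a v4i32, can become a 16-byte-aligned v4i32 load from FI
  // followed by a splat shuffle with mask <2, 2, 2, 2>.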
4937  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
4938    SDValue Ptr = LD->getBasePtr();
4939    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
4940      return SDValue();
4941    EVT PVT = LD->getValueType(0);
4942    if (PVT != MVT::i32 && PVT != MVT::f32)
4943      return SDValue();
4944
4945    int FI = -1;
4946    int64_t Offset = 0;
4947    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
4948      FI = FINode->getIndex();
4949      Offset = 0;
4950    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
4951               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
4952      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
4953      Offset = Ptr.getConstantOperandVal(1);
4954      Ptr = Ptr.getOperand(0);
4955    } else {
4956      return SDValue();
4957    }
4958
4959    // FIXME: 256-bit vector instructions don't require a strict alignment,
4960    // improve this code to support it better.
4961    unsigned RequiredAlign = VT.getSizeInBits()/8;
4962    SDValue Chain = LD->getChain();
4963    // Make sure the stack object alignment is at least 16 or 32.
4964    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
4965    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
4966      if (MFI->isFixedObjectIndex(FI)) {
4967        // Can't change the alignment. FIXME: It's possible to compute
4968        // the exact stack offset and reference FI + adjust offset instead.
4969        // If someone *really* cares about this, that's the way to implement it.
4970        return SDValue();
4971      } else {
4972        MFI->setObjectAlignment(FI, RequiredAlign);
4973      }
4974    }
4975
4976    // (Offset % 16 or 32) must be a multiple of 4. The address is then
4977    // Ptr + (Offset & ~(RequiredAlign-1)).
4978    if (Offset < 0)
4979      return SDValue();
4980    if ((Offset % RequiredAlign) & 3)
4981      return SDValue();
4982    int64_t StartOffset = Offset & ~(RequiredAlign-1);
4983    if (StartOffset)
4984      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
4985                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
4986
4987    int EltNo = (Offset - StartOffset) >> 2;
4988    unsigned NumElems = VT.getVectorNumElements();
4989
4990    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
4991    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
4992                             LD->getPointerInfo().getWithOffset(StartOffset),
4993                             false, false, false, 0);
4994
4995    SmallVector<int, 8> Mask;
4996    for (unsigned i = 0; i != NumElems; ++i)
4997      Mask.push_back(EltNo);
4998
4999    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
5000  }
5001
5002  return SDValue();
5003}
5004
5005/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
5006/// vector of type 'VT', see if the elements can be replaced by a single large
5007/// load which has the same value as a build_vector whose operands are 'elts'.
5008///
5009/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
5010///
5011/// FIXME: we'd also like to handle the case where the last elements are zero
5012/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
5013/// There's even a handy isZeroNode for that purpose.
5014static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
5015                                        DebugLoc &DL, SelectionDAG &DAG) {
5016  EVT EltVT = VT.getVectorElementType();
5017  unsigned NumElems = Elts.size();
5018
5019  LoadSDNode *LDBase = NULL;
5020  unsigned LastLoadedElt = -1U;
5021
5022  // For each element in the initializer, see if we've found a load or an undef.
5023  // If we don't find an initial load element, or later load elements are
5024  // non-consecutive, bail out.
5025  for (unsigned i = 0; i < NumElems; ++i) {
5026    SDValue Elt = Elts[i];
5027
5028    if (!Elt.getNode() ||
5029        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
5030      return SDValue();
5031    if (!LDBase) {
5032      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
5033        return SDValue();
5034      LDBase = cast<LoadSDNode>(Elt.getNode());
5035      LastLoadedElt = i;
5036      continue;
5037    }
5038    if (Elt.getOpcode() == ISD::UNDEF)
5039      continue;
5040
5041    LoadSDNode *LD = cast<LoadSDNode>(Elt);
5042    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
5043      return SDValue();
5044    LastLoadedElt = i;
5045  }
5046
5047  // If we have found an entire vector of loads and undefs, then return a large
5048  // load of the entire vector width starting at the base pointer.  If we found
5049  // consecutive loads for the low half, generate a vzext_load node.
5050  if (LastLoadedElt == NumElems - 1) {
5051    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
5052      return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5053                         LDBase->getPointerInfo(),
5054                         LDBase->isVolatile(), LDBase->isNonTemporal(),
5055                         LDBase->isInvariant(), 0);
5056    return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5057                       LDBase->getPointerInfo(),
5058                       LDBase->isVolatile(), LDBase->isNonTemporal(),
5059                       LDBase->isInvariant(), LDBase->getAlignment());
5060  }
5061  if (NumElems == 4 && LastLoadedElt == 1 &&
5062      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
5063    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
5064    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5065    SDValue ResNode =
5066        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
5067                                LDBase->getPointerInfo(),
5068                                LDBase->getAlignment(),
5069                                false/*isVolatile*/, true/*ReadMem*/,
5070                                false/*WriteMem*/);
5071
5072    // Make sure the newly-created LOAD is in the same position as LDBase in
5073    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
5074    // update uses of LDBase's output chain to use the TokenFactor.
5075    if (LDBase->hasAnyUseOfValue(1)) {
5076      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5077                             SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
5078      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5079      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5080                             SDValue(ResNode.getNode(), 1));
5081    }
5082
5083    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
5084  }
5085  return SDValue();
5086}
5087
5088/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
5089/// to generate a splat value for the following cases:
5090/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5091/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5092/// a scalar load, or a constant.
5093/// The VBROADCAST node is returned when a pattern is found,
5094/// or SDValue() otherwise.
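/// For example, a BUILD_VECTOR whose operands are all the same f32 load can be
/// lowered to a single VBROADCAST of that load.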
5095SDValue
5096X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
5097  if (!Subtarget->hasFp256())
5098    return SDValue();
5099
5100  EVT VT = Op.getValueType();
5101  DebugLoc dl = Op.getDebugLoc();
5102
5103  assert((VT.is128BitVector() || VT.is256BitVector()) &&
5104         "Unsupported vector type for broadcast.");
5105
5106  SDValue Ld;
5107  bool ConstSplatVal;
5108
5109  switch (Op.getOpcode()) {
5110    default:
5111      // Unknown pattern found.
5112      return SDValue();
5113
5114    case ISD::BUILD_VECTOR: {
5115      // The BUILD_VECTOR node must be a splat.
5116      if (!isSplatVector(Op.getNode()))
5117        return SDValue();
5118
5119      Ld = Op.getOperand(0);
5120      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5121                     Ld.getOpcode() == ISD::ConstantFP);
5122
5123      // The suspected load node has several users. Make sure that all
5124      // of its users are from the BUILD_VECTOR node.
5125      // Constants may have multiple users.
5126      if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
5127        return SDValue();
5128      break;
5129    }
5130
5131    case ISD::VECTOR_SHUFFLE: {
5132      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5133
5134      // Shuffles must have a splat mask where the first element is
5135      // broadcasted.
5136      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
5137        return SDValue();
5138
5139      SDValue Sc = Op.getOperand(0);
5140      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
5141          Sc.getOpcode() != ISD::BUILD_VECTOR) {
5142
5143        if (!Subtarget->hasInt256())
5144          return SDValue();
5145
5146        // Use the register form of the broadcast instruction available on AVX2.
5147        if (VT.is256BitVector())
5148          Sc = Extract128BitVector(Sc, 0, DAG, dl);
5149        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
5150      }
5151
5152      Ld = Sc.getOperand(0);
5153      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5154                       Ld.getOpcode() == ISD::ConstantFP);
5155
5156      // The scalar_to_vector node and the suspected
5157      // load node must have exactly one user.
5158      // Constants may have multiple users.
5159      if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
5160        return SDValue();
5161      break;
5162    }
5163  }
5164
5165  bool Is256 = VT.is256BitVector();
5166
5167  // Handle broadcasting a single constant scalar from the constant pool
5168  // into a vector. On Sandy Bridge it is still better to load a constant vector
5169  // from the constant pool than to broadcast it from a scalar.
5170  if (ConstSplatVal && Subtarget->hasInt256()) {
5171    EVT CVT = Ld.getValueType();
5172    assert(!CVT.isVector() && "Must not broadcast a vector type");
5173    unsigned ScalarSize = CVT.getSizeInBits();
5174
5175    if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
5176      const Constant *C = 0;
5177      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5178        C = CI->getConstantIntValue();
5179      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5180        C = CF->getConstantFPValue();
5181
5182      assert(C && "Invalid constant type");
5183
5184      SDValue CP = DAG.getConstantPool(C, getPointerTy());
5185      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5186      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
5187                       MachinePointerInfo::getConstantPool(),
5188                       false, false, false, Alignment);
5189
5190      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5191    }
5192  }
5193
5194  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
5195  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5196
5197  // Handle AVX2 in-register broadcasts.
5198  if (!IsLoad && Subtarget->hasInt256() &&
5199      (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
5200    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5201
5202  // The scalar source must be a normal load.
5203  if (!IsLoad)
5204    return SDValue();
5205
5206  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
5207    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5208
5209  // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
5210  // match double, since there is no vbroadcastsd xmm.
5211  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
5212    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5213      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5214  }
5215
5216  // Unsupported broadcast.
5217  return SDValue();
5218}
5219
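/// buildFromShuffleMostly - Lower a BUILD_VECTOR whose operands are mostly
/// EXTRACT_VECTOR_ELTs of at most two source vectors into a vector shuffle,
/// re-inserting the few remaining scalar operands afterwards. For example
/// (illustrative), <extract(A,0), extract(A,1), s, extract(A,3)> becomes
/// shuffle(A, undef, <0, 1, -1, 3>) followed by inserting s at index 2.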
5220SDValue
5221X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
5222  EVT VT = Op.getValueType();
5223
5224  // Skip if insert_vec_elt is not supported.
5225  if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
5226    return SDValue();
5227
5228  DebugLoc DL = Op.getDebugLoc();
5229  unsigned NumElems = Op.getNumOperands();
5230
5231  SDValue VecIn1;
5232  SDValue VecIn2;
5233  SmallVector<unsigned, 4> InsertIndices;
5234  SmallVector<int, 8> Mask(NumElems, -1);
5235
5236  for (unsigned i = 0; i != NumElems; ++i) {
5237    unsigned Opc = Op.getOperand(i).getOpcode();
5238
5239    if (Opc == ISD::UNDEF)
5240      continue;
5241
5242    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
5243      // Quit if more than one element needs inserting.
5244      if (InsertIndices.size() > 1)
5245        return SDValue();
5246
5247      InsertIndices.push_back(i);
5248      continue;
5249    }
5250
5251    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
5252    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
5253
5254    // Quit if extracted from vector of different type.
5255    if (ExtractedFromVec.getValueType() != VT)
5256      return SDValue();
5257
5258    // Quit if non-constant index.
5259    if (!isa<ConstantSDNode>(ExtIdx))
5260      return SDValue();
5261
5262    if (VecIn1.getNode() == 0)
5263      VecIn1 = ExtractedFromVec;
5264    else if (VecIn1 != ExtractedFromVec) {
5265      if (VecIn2.getNode() == 0)
5266        VecIn2 = ExtractedFromVec;
5267      else if (VecIn2 != ExtractedFromVec)
5268        // Quit if more than 2 vectors to shuffle
5269        return SDValue();
5270    }
5271
5272    unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5273
5274    if (ExtractedFromVec == VecIn1)
5275      Mask[i] = Idx;
5276    else if (ExtractedFromVec == VecIn2)
5277      Mask[i] = Idx + NumElems;
5278  }
5279
5280  if (VecIn1.getNode() == 0)
5281    return SDValue();
5282
5283  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5284  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
5285  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
5286    unsigned Idx = InsertIndices[i];
5287    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
5288                     DAG.getIntPtrConstant(Idx));
5289  }
5290
5291  return NV;
5292}
5293
5294SDValue
5295X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5296  DebugLoc dl = Op.getDebugLoc();
5297
5298  EVT VT = Op.getValueType();
5299  EVT ExtVT = VT.getVectorElementType();
5300  unsigned NumElems = Op.getNumOperands();
5301
5302  // Vectors containing all zeros can be matched by pxor and xorps later
5303  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5304    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
5305    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
5306    if (VT == MVT::v4i32 || VT == MVT::v8i32)
5307      return Op;
5308
5309    return getZeroVector(VT, Subtarget, DAG, dl);
5310  }
5311
5312  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
5313  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
5314  // vpcmpeqd on 256-bit vectors.
5315  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
5316    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
5317      return Op;
5318
5319    return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
5320  }
5321
5322  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
5323  if (Broadcast.getNode())
5324    return Broadcast;
5325
5326  unsigned EVTBits = ExtVT.getSizeInBits();
5327
5328  unsigned NumZero  = 0;
5329  unsigned NumNonZero = 0;
5330  unsigned NonZeros = 0;
5331  bool IsAllConstants = true;
5332  SmallSet<SDValue, 8> Values;
5333  for (unsigned i = 0; i < NumElems; ++i) {
5334    SDValue Elt = Op.getOperand(i);
5335    if (Elt.getOpcode() == ISD::UNDEF)
5336      continue;
5337    Values.insert(Elt);
5338    if (Elt.getOpcode() != ISD::Constant &&
5339        Elt.getOpcode() != ISD::ConstantFP)
5340      IsAllConstants = false;
5341    if (X86::isZeroNode(Elt))
5342      NumZero++;
5343    else {
5344      NonZeros |= (1 << i);
5345      NumNonZero++;
5346    }
5347  }
5348
5349  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
5350  if (NumNonZero == 0)
5351    return DAG.getUNDEF(VT);
5352
5353  // Special case for single non-zero, non-undef, element.
5354  if (NumNonZero == 1) {
5355    unsigned Idx = CountTrailingZeros_32(NonZeros);
5356    SDValue Item = Op.getOperand(Idx);
5357
5358    // If this is an insertion of an i64 value on x86-32, and if the top bits of
5359    // the value are obviously zero, truncate the value to i32 and do the
5360    // insertion that way.  Only do this if the value is non-constant or if the
5361    // value is a constant being inserted into element 0.  It is cheaper to do
5362    // a constant pool load than it is to do a movd + shuffle.
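    // For example, a v2i64 build_vector of (i64 zext %x) and 0 on x86-32 becomes
    // a v4i32 movd of the truncated %x, a zero-extending shuffle, and a bitcast
    // back to v2i64.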
5363    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
5364        (!IsAllConstants || Idx == 0)) {
5365      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
5366        // Handle SSE only.
5367        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
5368        EVT VecVT = MVT::v4i32;
5369        unsigned VecElts = 4;
5370
5371        // Truncate the value (which may itself be a constant) to i32, and
5372        // convert it to a vector with movd (S2V+shuffle to zero extend).
5373        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
5374        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
5375        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5376
5377        // Now we have our 32-bit value zero extended in the low element of
5378        // a vector.  If Idx != 0, swizzle it into place.
5379        if (Idx != 0) {
5380          SmallVector<int, 4> Mask;
5381          Mask.push_back(Idx);
5382          for (unsigned i = 1; i != VecElts; ++i)
5383            Mask.push_back(i);
5384          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
5385                                      &Mask[0]);
5386        }
5387        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5388      }
5389    }
5390
5391    // If we have a constant or non-constant insertion into the low element of
5392    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
5393    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
5394    // depending on what the source datatype is.
5395    if (Idx == 0) {
5396      if (NumZero == 0)
5397        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5398
5399      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
5400          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
5401        if (VT.is256BitVector()) {
5402          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
5403          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
5404                             Item, DAG.getIntPtrConstant(0));
5405        }
5406        assert(VT.is128BitVector() && "Expected an SSE value type!");
5407        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5408        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
5409        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5410      }
5411
5412      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
5413        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
5414        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
5415        if (VT.is256BitVector()) {
5416          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
5417          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
5418        } else {
5419          assert(VT.is128BitVector() && "Expected an SSE value type!");
5420          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5421        }
5422        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5423      }
5424    }
5425
5426    // Is it a vector logical left shift?
5427    if (NumElems == 2 && Idx == 1 &&
5428        X86::isZeroNode(Op.getOperand(0)) &&
5429        !X86::isZeroNode(Op.getOperand(1))) {
5430      unsigned NumBits = VT.getSizeInBits();
5431      return getVShift(true, VT,
5432                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5433                                   VT, Op.getOperand(1)),
5434                       NumBits/2, DAG, *this, dl);
5435    }
5436
5437    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
5438      return SDValue();
5439
5440    // Otherwise, if this is a vector with i32 or f32 elements, and the element
5441    // is a non-constant being inserted into an element other than the low one,
5442    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
5443    // movd/movss) to move this into the low element, then shuffle it into
5444    // place.
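    // For example, inserting a non-constant f32 %x at index 2 of a v4f32 puts %x
    // into element 0 (zeroing the other elements when NumZero > 0) and then
    // shuffles it into place with the mask <1, 1, 0, 1>.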
5445    if (EVTBits == 32) {
5446      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5447
5448      // Turn it into a shuffle of zero and zero-extended scalar to vector.
5449      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
5450      SmallVector<int, 8> MaskVec;
5451      for (unsigned i = 0; i != NumElems; ++i)
5452        MaskVec.push_back(i == Idx ? 0 : 1);
5453      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
5454    }
5455  }
5456
5457  // Splat is obviously ok. Let legalizer expand it to a shuffle.
5458  if (Values.size() == 1) {
5459    if (EVTBits == 32) {
5460      // Instead of a shuffle like this:
5461      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
5462      // Check if it's possible to issue this instead.
5463      // shuffle (vload ptr), undef, <1, 1, 1, 1>
5464      unsigned Idx = CountTrailingZeros_32(NonZeros);
5465      SDValue Item = Op.getOperand(Idx);
5466      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
5467        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
5468    }
5469    return SDValue();
5470  }
5471
5472  // A vector full of immediates; various special cases are already
5473  // handled, so this is best done with a single constant-pool load.
5474  if (IsAllConstants)
5475    return SDValue();
5476
5477  // For AVX-length vectors, build the individual 128-bit pieces and use
5478  // shuffles to put them in place.
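  // For example, a v8i32 BUILD_VECTOR is split into two v4i32 BUILD_VECTORs
  // whose results are recombined with INSERT_SUBVECTOR nodes that match
  // vinsertf128.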
5479  if (VT.is256BitVector()) {
5480    SmallVector<SDValue, 32> V;
5481    for (unsigned i = 0; i != NumElems; ++i)
5482      V.push_back(Op.getOperand(i));
5483
5484    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
5485
5486    // Build both the lower and upper subvector.
5487    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
5488    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
5489                                NumElems/2);
5490
5491    // Recreate the wider vector with the lower and upper part.
5492    return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
5493  }
5494
5495  // Let legalizer expand 2-wide build_vectors.
5496  if (EVTBits == 64) {
5497    if (NumNonZero == 1) {
5498      // One half is zero or undef.
5499      unsigned Idx = CountTrailingZeros_32(NonZeros);
5500      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
5501                                 Op.getOperand(Idx));
5502      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
5503    }
5504    return SDValue();
5505  }
5506
5507  // If element VT is < 32 bits, convert it to inserts into a zero vector.
5508  if (EVTBits == 8 && NumElems == 16) {
5509    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
5510                                        Subtarget, *this);
5511    if (V.getNode()) return V;
5512  }
5513
5514  if (EVTBits == 16 && NumElems == 8) {
5515    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
5516                                      Subtarget, *this);
5517    if (V.getNode()) return V;
5518  }
5519
5520  // If element VT is == 32 bits, turn it into a number of shuffles.
5521  SmallVector<SDValue, 8> V(NumElems);
5522  if (NumElems == 4 && NumZero > 0) {
5523    for (unsigned i = 0; i < 4; ++i) {
5524      bool isZero = !(NonZeros & (1 << i));
5525      if (isZero)
5526        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
5527      else
5528        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5529    }
5530
5531    for (unsigned i = 0; i < 2; ++i) {
5532      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
5533        default: break;
5534        case 0:
5535          V[i] = V[i*2];  // Must be a zero vector.
5536          break;
5537        case 1:
5538          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
5539          break;
5540        case 2:
5541          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
5542          break;
5543        case 3:
5544          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
5545          break;
5546      }
5547    }
5548
5549    bool Reverse1 = (NonZeros & 0x3) == 2;
5550    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
5551    int MaskVec[] = {
5552      Reverse1 ? 1 : 0,
5553      Reverse1 ? 0 : 1,
5554      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
5555      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
5556    };
5557    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
5558  }
5559
5560  if (Values.size() > 1 && VT.is128BitVector()) {
5561    // Check for a build vector of consecutive loads.
5562    for (unsigned i = 0; i < NumElems; ++i)
5563      V[i] = Op.getOperand(i);
5564
5565    // Check for elements which are consecutive loads.
5566    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
5567    if (LD.getNode())
5568      return LD;
5569
5570    // Check for a build vector that is mostly a shuffle plus a few inserts.
5571    SDValue Sh = buildFromShuffleMostly(Op, DAG);
5572    if (Sh.getNode())
5573      return Sh;
5574
5575    // For SSE 4.1, use insertps to insert the remaining elements one at a time.
5576    if (getSubtarget()->hasSSE41()) {
5577      SDValue Result;
5578      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
5579        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
5580      else
5581        Result = DAG.getUNDEF(VT);
5582
5583      for (unsigned i = 1; i < NumElems; ++i) {
5584        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
5585        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
5586                             Op.getOperand(i), DAG.getIntPtrConstant(i));
5587      }
5588      return Result;
5589    }
5590
5591    // Otherwise, expand into a number of unpckl*, start by extending each of
5592    // our (non-undef) elements to the full vector width with the element in the
5593    // bottom slot of the vector (which generates no code for SSE).
5594    for (unsigned i = 0; i < NumElems; ++i) {
5595      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
5596        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5597      else
5598        V[i] = DAG.getUNDEF(VT);
5599    }
5600
5601    // Next, we iteratively mix elements, e.g. for v4f32:
5602    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
5603    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
5604    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
5605    unsigned EltStride = NumElems >> 1;
5606    while (EltStride != 0) {
5607      for (unsigned i = 0; i < EltStride; ++i) {
5608        // If V[i+EltStride] is undef and this is the first round of mixing,
5609        // then it is safe to just drop this shuffle: V[i] is already in the
5610        // right place, the one element (since it's the first round) being
5611        // inserted as undef can be dropped.  This isn't safe for successive
5612        // rounds because they will permute elements within both vectors.
5613        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
5614            EltStride == NumElems/2)
5615          continue;
5616
5617        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
5618      }
5619      EltStride >>= 1;
5620    }
5621    return V[0];
5622  }
5623  return SDValue();
5624}
5625
5626// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
5627// to create 256-bit vectors from two other 128-bit ones.
5628static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5629  DebugLoc dl = Op.getDebugLoc();
5630  EVT ResVT = Op.getValueType();
5631
5632  assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
5633
5634  SDValue V1 = Op.getOperand(0);
5635  SDValue V2 = Op.getOperand(1);
5636  unsigned NumElems = ResVT.getVectorNumElements();
5637
5638  return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
5639}
5640
5641static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5642  assert(Op.getNumOperands() == 2);
5643
5644  // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
5645  // from two other 128-bit ones.
5646  return LowerAVXCONCAT_VECTORS(Op, DAG);
5647}
5648
5649// Try to lower a shuffle node into a simple blend instruction.
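// For example, for a v4i32 shuffle of V1 and V2 with mask <0, 5, 2, 7>,
// elements 1 and 3 of the result come from V2, so MaskValue becomes 0b1010 and
// that value is used as the BLENDI immediate.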
5650static SDValue
5651LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
5652                           const X86Subtarget *Subtarget, SelectionDAG &DAG) {
5653  SDValue V1 = SVOp->getOperand(0);
5654  SDValue V2 = SVOp->getOperand(1);
5655  DebugLoc dl = SVOp->getDebugLoc();
5656  EVT VT = SVOp->getValueType(0);
5657  EVT EltVT = VT.getVectorElementType();
5658  unsigned NumElems = VT.getVectorNumElements();
5659
5660  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
5661    return SDValue();
5662  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
5663    return SDValue();
5664
5665  // Check the mask for BLEND and build the value.
5666  unsigned MaskValue = 0;
5667  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
5668  unsigned NumLanes = (NumElems-1)/8 + 1;
5669  unsigned NumElemsInLane = NumElems / NumLanes;
5670
5671  // Blend for v16i16 should be symmetric for both lanes.
5672  for (unsigned i = 0; i < NumElemsInLane; ++i) {
5673
5674    int SndLaneEltIdx = (NumLanes == 2) ?
5675      SVOp->getMaskElt(i + NumElemsInLane) : -1;
5676    int EltIdx = SVOp->getMaskElt(i);
5677
5678    if ((EltIdx == -1 || EltIdx == (int)i) &&
5679        (SndLaneEltIdx == -1 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
5680      continue;
5681
5682    if (((unsigned)EltIdx == (i + NumElems)) &&
5683        (SndLaneEltIdx == -1 ||
5684         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
5685      MaskValue |= (1<<i);
5686    else
5687      return SDValue();
5688  }
5689
5690  // Convert i64 vectors, and i32 vectors when AVX2 is unavailable, to
5691  // floating point. AVX2 introduced VPBLENDD for 128 and 256-bit vectors.
5692  EVT BlendVT = VT;
5693  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
5694    BlendVT = EVT::getVectorVT(*DAG.getContext(),
5695                              EVT::getFloatingPointVT(EltVT.getSizeInBits()),
5696                              NumElems);
5697    V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
5698    V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
5699  }
5700
5701  SDValue Ret =  DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
5702                             DAG.getConstant(MaskValue, MVT::i32));
5703  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
5704}
5705
5706// v8i16 shuffles - Prefer shuffles in the following order:
5707// 1. [all]   pshuflw, pshufhw, optional move
5708// 2. [ssse3] 1 x pshufb
5709// 3. [ssse3] 2 x pshufb + 1 x por
5710// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
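// For example, the mask <2, 1, 0, 3, 4, 5, 6, 7> only permutes the low four
// words, so it needs a single pshuflw with immediate 0xC6.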
5711static SDValue
5712LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
5713                         SelectionDAG &DAG) {
5714  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5715  SDValue V1 = SVOp->getOperand(0);
5716  SDValue V2 = SVOp->getOperand(1);
5717  DebugLoc dl = SVOp->getDebugLoc();
5718  SmallVector<int, 8> MaskVals;
5719
5720  // Determine if more than 1 of the words in each of the low and high quadwords
5721  // of the result come from the same quadword of one of the two inputs.  Undef
5722  // mask values count as coming from any quadword, for better codegen.
5723  unsigned LoQuad[] = { 0, 0, 0, 0 };
5724  unsigned HiQuad[] = { 0, 0, 0, 0 };
5725  std::bitset<4> InputQuads;
5726  for (unsigned i = 0; i < 8; ++i) {
5727    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
5728    int EltIdx = SVOp->getMaskElt(i);
5729    MaskVals.push_back(EltIdx);
5730    if (EltIdx < 0) {
5731      ++Quad[0];
5732      ++Quad[1];
5733      ++Quad[2];
5734      ++Quad[3];
5735      continue;
5736    }
5737    ++Quad[EltIdx / 4];
5738    InputQuads.set(EltIdx / 4);
5739  }
5740
5741  int BestLoQuad = -1;
5742  unsigned MaxQuad = 1;
5743  for (unsigned i = 0; i < 4; ++i) {
5744    if (LoQuad[i] > MaxQuad) {
5745      BestLoQuad = i;
5746      MaxQuad = LoQuad[i];
5747    }
5748  }
5749
5750  int BestHiQuad = -1;
5751  MaxQuad = 1;
5752  for (unsigned i = 0; i < 4; ++i) {
5753    if (HiQuad[i] > MaxQuad) {
5754      BestHiQuad = i;
5755      MaxQuad = HiQuad[i];
5756    }
5757  }
5758
5759  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
5760  // of the two input vectors, shuffle them into one input vector so only a
5761  // single pshufb instruction is necessary. If there are more than 2 input
5762  // quads, disable the next transformation since it does not help SSSE3.
5763  bool V1Used = InputQuads[0] || InputQuads[1];
5764  bool V2Used = InputQuads[2] || InputQuads[3];
5765  if (Subtarget->hasSSSE3()) {
5766    if (InputQuads.count() == 2 && V1Used && V2Used) {
5767      BestLoQuad = InputQuads[0] ? 0 : 1;
5768      BestHiQuad = InputQuads[2] ? 2 : 3;
5769    }
5770    if (InputQuads.count() > 2) {
5771      BestLoQuad = -1;
5772      BestHiQuad = -1;
5773    }
5774  }
5775
5776  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
5777  // the shuffle mask.  If a quad is scored as -1, that means that it contains
5778  // words from all 4 input quadwords.
5779  SDValue NewV;
5780  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
5781    int MaskV[] = {
5782      BestLoQuad < 0 ? 0 : BestLoQuad,
5783      BestHiQuad < 0 ? 1 : BestHiQuad
5784    };
5785    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
5786                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
5787                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
5788    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
5789
5790    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
5791    // source words for the shuffle, to aid later transformations.
5792    bool AllWordsInNewV = true;
5793    bool InOrder[2] = { true, true };
5794    for (unsigned i = 0; i != 8; ++i) {
5795      int idx = MaskVals[i];
5796      if (idx != (int)i)
5797        InOrder[i/4] = false;
5798      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
5799        continue;
5800      AllWordsInNewV = false;
5801      break;
5802    }
5803
5804    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
5805    if (AllWordsInNewV) {
5806      for (int i = 0; i != 8; ++i) {
5807        int idx = MaskVals[i];
5808        if (idx < 0)
5809          continue;
5810        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
5811        if ((idx != i) && idx < 4)
5812          pshufhw = false;
5813        if ((idx != i) && idx > 3)
5814          pshuflw = false;
5815      }
5816      V1 = NewV;
5817      V2Used = false;
5818      BestLoQuad = 0;
5819      BestHiQuad = 1;
5820    }
5821
5822    // If we've eliminated the use of V2, and the new mask is a pshuflw or
5823    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
5824    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
5825      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
5826      unsigned TargetMask = 0;
5827      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
5828                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
5829      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5830      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
5831                             getShufflePSHUFLWImmediate(SVOp);
5832      V1 = NewV.getOperand(0);
5833      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
5834    }
5835  }
5836
5837  // If we have SSSE3, and all words of the result are from 1 input vector,
5838  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
5839  // is present, fall back to case 4.
5840  if (Subtarget->hasSSSE3()) {
5841    SmallVector<SDValue,16> pshufbMask;
5842
5843    // If we have elements from both input vectors, set the high bit of the
5844    // shuffle mask element to zero out elements that come from V2 in the V1
5845    // mask, and elements that come from V1 in the V2 mask, so that the two
5846    // results can be OR'd together.
5847    bool TwoInputs = V1Used && V2Used;
5848    for (unsigned i = 0; i != 8; ++i) {
5849      int EltIdx = MaskVals[i] * 2;
5850      int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
5851      int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
5852      pshufbMask.push_back(DAG.getConstant(Idx0,   MVT::i8));
5853      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
5854    }
5855    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
5856    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5857                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5858                                 MVT::v16i8, &pshufbMask[0], 16));
5859    if (!TwoInputs)
5860      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5861
5862    // Calculate the shuffle mask for the second input, shuffle it, and
5863    // OR it with the first shuffled input.
5864    pshufbMask.clear();
5865    for (unsigned i = 0; i != 8; ++i) {
5866      int EltIdx = MaskVals[i] * 2;
5867      int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
5868      int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
5869      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
5870      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
5871    }
5872    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
5873    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
5874                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5875                                 MVT::v16i8, &pshufbMask[0], 16));
5876    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
5877    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5878  }
5879
5880  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
5881  // and update MaskVals with new element order.
5882  std::bitset<8> InOrder;
5883  if (BestLoQuad >= 0) {
5884    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
5885    for (int i = 0; i != 4; ++i) {
5886      int idx = MaskVals[i];
5887      if (idx < 0) {
5888        InOrder.set(i);
5889      } else if ((idx / 4) == BestLoQuad) {
5890        MaskV[i] = idx & 3;
5891        InOrder.set(i);
5892      }
5893    }
5894    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
5895                                &MaskV[0]);
5896
5897    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
5898      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5899      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
5900                                  NewV.getOperand(0),
5901                                  getShufflePSHUFLWImmediate(SVOp), DAG);
5902    }
5903  }
5904
5905  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
5906  // and update MaskVals with the new element order.
5907  if (BestHiQuad >= 0) {
5908    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
5909    for (unsigned i = 4; i != 8; ++i) {
5910      int idx = MaskVals[i];
5911      if (idx < 0) {
5912        InOrder.set(i);
5913      } else if ((idx / 4) == BestHiQuad) {
5914        MaskV[i] = (idx & 3) + 4;
5915        InOrder.set(i);
5916      }
5917    }
5918    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
5919                                &MaskV[0]);
5920
5921    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
5922      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5923      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
5924                                  NewV.getOperand(0),
5925                                  getShufflePSHUFHWImmediate(SVOp), DAG);
5926    }
5927  }
5928
5929  // In case BestHi & BestLo were both -1, which means each quadword has a word
5930  // from each of the four input quadwords, calculate the InOrder bitvector now
5931  // before falling through to the insert/extract cleanup.
5932  if (BestLoQuad == -1 && BestHiQuad == -1) {
5933    NewV = V1;
5934    for (int i = 0; i != 8; ++i)
5935      if (MaskVals[i] < 0 || MaskVals[i] == i)
5936        InOrder.set(i);
5937  }
5938
5939  // The other elements are put in the right place using pextrw and pinsrw.
5940  for (unsigned i = 0; i != 8; ++i) {
5941    if (InOrder[i])
5942      continue;
5943    int EltIdx = MaskVals[i];
5944    if (EltIdx < 0)
5945      continue;
5946    SDValue ExtOp = (EltIdx < 8) ?
5947      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
5948                  DAG.getIntPtrConstant(EltIdx)) :
5949      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
5950                  DAG.getIntPtrConstant(EltIdx - 8));
5951    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
5952                       DAG.getIntPtrConstant(i));
5953  }
5954  return NewV;
5955}
5956
5957// v16i8 shuffles - Prefer shuffles in the following order:
5958// 1. [ssse3] 1 x pshufb
5959// 2. [ssse3] 2 x pshufb + 1 x por
5960// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
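// For example, a byte reversal of one 16-byte vector needs a single pshufb
// whose control bytes are <15, 14, ..., 0>; a control byte with the high bit
// set (0x80) zeroes the corresponding result byte.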
5961static
5962SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
5963                                 SelectionDAG &DAG,
5964                                 const X86TargetLowering &TLI) {
5965  SDValue V1 = SVOp->getOperand(0);
5966  SDValue V2 = SVOp->getOperand(1);
5967  DebugLoc dl = SVOp->getDebugLoc();
5968  ArrayRef<int> MaskVals = SVOp->getMask();
5969
5970  // If we have SSSE3, case 1 is generated when all result bytes come from
5971  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
5972  // present, fall back to case 3.
5973
5974  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
5975  if (TLI.getSubtarget()->hasSSSE3()) {
5976    SmallVector<SDValue,16> pshufbMask;
5977
5978    // If all result elements are from one input vector, then only translate
5979    // undef mask values to 0x80 (zero out result) in the pshufb mask.
5980    //
5981    // Otherwise, we have elements from both input vectors, and must zero out
5982    // elements that come from V2 in the first mask, and V1 in the second mask
5983    // so that we can OR them together.
5984    for (unsigned i = 0; i != 16; ++i) {
5985      int EltIdx = MaskVals[i];
5986      if (EltIdx < 0 || EltIdx >= 16)
5987        EltIdx = 0x80;
5988      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
5989    }
5990    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5991                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5992                                 MVT::v16i8, &pshufbMask[0], 16));
5993
5994    // As PSHUFB will zero elements with negative indices, it's safe to ignore
5995    // the 2nd operand if it's undefined or zero.
5996    if (V2.getOpcode() == ISD::UNDEF ||
5997        ISD::isBuildVectorAllZeros(V2.getNode()))
5998      return V1;
5999
6000    // Calculate the shuffle mask for the second input, shuffle it, and
6001    // OR it with the first shuffled input.
6002    pshufbMask.clear();
6003    for (unsigned i = 0; i != 16; ++i) {
6004      int EltIdx = MaskVals[i];
6005      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
6006      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6007    }
6008    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
6009                     DAG.getNode(ISD::BUILD_VECTOR, dl,
6010                                 MVT::v16i8, &pshufbMask[0], 16));
6011    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
6012  }
6013
6014  // No SSSE3 - Calculate the in-place words and then fix all out-of-place
6015  // words with 0-16 extracts & inserts.  Worst case is 16 bytes out of order
6016  // from the 16 different words that comprise the two doublequadword inputs.
6017  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
6018  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
6019  SDValue NewV = V1;
6020  for (int i = 0; i != 8; ++i) {
6021    int Elt0 = MaskVals[i*2];
6022    int Elt1 = MaskVals[i*2+1];
6023
6024    // This word of the result is all undef, skip it.
6025    if (Elt0 < 0 && Elt1 < 0)
6026      continue;
6027
6028    // This word of the result is already in the correct place, skip it.
6029    if ((Elt0 == i*2) && (Elt1 == i*2+1))
6030      continue;
6031
6032    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
6033    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
6034    SDValue InsElt;
6035
6036    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
6037    // together with a single extract, extract the word and insert it.
6038    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
6039      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
6040                           DAG.getIntPtrConstant(Elt1 / 2));
6041      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
6042                        DAG.getIntPtrConstant(i));
6043      continue;
6044    }
6045
6046    // If Elt1 is defined, extract it from the appropriate source.  If the
6047    // source byte is not also odd, shift the extracted word left 8 bits;
6048    // otherwise clear the bottom 8 bits if we need to do an OR.
6049    if (Elt1 >= 0) {
6050      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
6051                           DAG.getIntPtrConstant(Elt1 / 2));
6052      if ((Elt1 & 1) == 0)
6053        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
6054                             DAG.getConstant(8,
6055                                  TLI.getShiftAmountTy(InsElt.getValueType())));
6056      else if (Elt0 >= 0)
6057        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
6058                             DAG.getConstant(0xFF00, MVT::i16));
6059    }
6060    // If Elt0 is defined, extract it from the appropriate source.  If the
6061    // source byte is not also even, shift the extracted word right 8 bits. If
6062    // Elt1 was also defined, OR the extracted values together before
6063    // inserting them in the result.
6064    if (Elt0 >= 0) {
6065      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
6066                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
6067      if ((Elt0 & 1) != 0)
6068        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
6069                              DAG.getConstant(8,
6070                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
6071      else if (Elt1 >= 0)
6072        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
6073                             DAG.getConstant(0x00FF, MVT::i16));
6074      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
6075                         : InsElt0;
6076    }
6077    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
6078                       DAG.getIntPtrConstant(i));
6079  }
6080  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
6081}
6082
6083// v32i8 shuffles - Translate to VPSHUFB if possible.
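// Note that vpshufb shuffles bytes within each 128-bit lane independently, so
// a mask element that would pull a byte across the lane boundary (e.g. result
// byte 3 taken from source byte 20) cannot be handled here.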
6084static
6085SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
6086                                 const X86Subtarget *Subtarget,
6087                                 SelectionDAG &DAG) {
6088  EVT VT = SVOp->getValueType(0);
6089  SDValue V1 = SVOp->getOperand(0);
6090  SDValue V2 = SVOp->getOperand(1);
6091  DebugLoc dl = SVOp->getDebugLoc();
6092  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
6093
6094  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6095  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
6096  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
6097
6098  // VPSHUFB may be generated if
6099  // (1) one of the input vectors is undefined or a zeroinitializer (the mask
6100  // value 0x80 puts 0 in the corresponding slot of the result), and
6101  // (2) the mask indexes don't cross the 128-bit lanes.
6102  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
6103      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
6104    return SDValue();
6105
6106  if (V1IsAllZero && !V2IsAllZero) {
6107    CommuteVectorShuffleMask(MaskVals, 32);
6108    V1 = V2;
6109  }
6110  SmallVector<SDValue, 32> pshufbMask;
6111  for (unsigned i = 0; i != 32; i++) {
6112    int EltIdx = MaskVals[i];
6113    if (EltIdx < 0 || EltIdx >= 32)
6114      EltIdx = 0x80;
6115    else {
6116      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
6117        // Crossing lanes is not allowed.
6118        return SDValue();
6119      EltIdx &= 0xf;
6120    }
6121    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6122  }
6123  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
6124                      DAG.getNode(ISD::BUILD_VECTOR, dl,
6125                                  MVT::v32i8, &pshufbMask[0], 32));
6126}
6127
6128/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
6129/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
6130/// done when every pair / quad of shuffle mask elements point to elements in
6131/// the right sequence. e.g.
6132/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
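/// This example becomes the v4i32 shuffle <1, 5, 0, 7> of X and Y bitcast to
/// v4i32.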
6133static
6134SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
6135                                 SelectionDAG &DAG, DebugLoc dl) {
6136  MVT VT = SVOp->getValueType(0).getSimpleVT();
6137  unsigned NumElems = VT.getVectorNumElements();
6138  MVT NewVT;
6139  unsigned Scale;
6140  switch (VT.SimpleTy) {
6141  default: llvm_unreachable("Unexpected!");
6142  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
6143  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
6144  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
6145  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
6146  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
6147  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
6148  }
6149
6150  SmallVector<int, 8> MaskVec;
6151  for (unsigned i = 0; i != NumElems; i += Scale) {
6152    int StartIdx = -1;
6153    for (unsigned j = 0; j != Scale; ++j) {
6154      int EltIdx = SVOp->getMaskElt(i+j);
6155      if (EltIdx < 0)
6156        continue;
6157      if (StartIdx < 0)
6158        StartIdx = (EltIdx / Scale);
6159      if (EltIdx != (int)(StartIdx*Scale + j))
6160        return SDValue();
6161    }
6162    MaskVec.push_back(StartIdx);
6163  }
6164
6165  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
6166  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
6167  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
6168}
6169
6170/// getVZextMovL - Return a zero-extending vector move low node.
6171///
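/// For example, (v4i32 X86ISD::VZEXT_MOVL X) produces <X[0], 0, 0, 0>. When the
/// source is not a scalar load, a movd/movq form is preferred because the
/// register-to-register movss/movsd forms do not clear the upper elements.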
6172static SDValue getVZextMovL(EVT VT, EVT OpVT,
6173                            SDValue SrcOp, SelectionDAG &DAG,
6174                            const X86Subtarget *Subtarget, DebugLoc dl) {
6175  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
6176    LoadSDNode *LD = NULL;
6177    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
6178      LD = dyn_cast<LoadSDNode>(SrcOp);
6179    if (!LD) {
6180      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
6181      // instead.
6182      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
6183      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
6184          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6185          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
6186          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
6187        // PR2108
6188        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
6189        return DAG.getNode(ISD::BITCAST, dl, VT,
6190                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6191                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6192                                                   OpVT,
6193                                                   SrcOp.getOperand(0)
6194                                                          .getOperand(0))));
6195      }
6196    }
6197  }
6198
6199  return DAG.getNode(ISD::BITCAST, dl, VT,
6200                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6201                                 DAG.getNode(ISD::BITCAST, dl,
6202                                             OpVT, SrcOp)));
6203}
6204
6205/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
6206/// which could not be matched by any known target specific shuffle
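/// pattern. The shuffle is split into two half-width shuffles, one per 128-bit
/// lane of the result, and the two halves are concatenated back together.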
6207static SDValue
6208LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6209
6210  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
6211  if (NewOp.getNode())
6212    return NewOp;
6213
6214  EVT VT = SVOp->getValueType(0);
6215
6216  unsigned NumElems = VT.getVectorNumElements();
6217  unsigned NumLaneElems = NumElems / 2;
6218
6219  DebugLoc dl = SVOp->getDebugLoc();
6220  MVT EltVT = VT.getVectorElementType().getSimpleVT();
6221  EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
6222  SDValue Output[2];
6223
6224  SmallVector<int, 16> Mask;
6225  for (unsigned l = 0; l < 2; ++l) {
6226    // Build a shuffle mask for the output, discovering on the fly which
6227    // input vectors to use as shuffle operands (recorded in InputUsed).
6228    // If building a suitable shuffle vector proves too hard, then bail
6229    // out with UseBuildVector set.
6230    bool UseBuildVector = false;
6231    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
6232    unsigned LaneStart = l * NumLaneElems;
6233    for (unsigned i = 0; i != NumLaneElems; ++i) {
6234      // The mask element.  This indexes into the input.
6235      int Idx = SVOp->getMaskElt(i+LaneStart);
6236      if (Idx < 0) {
6237        // The mask element does not index into any input vector.
6238        Mask.push_back(-1);
6239        continue;
6240      }
6241
6242      // The input vector this mask element indexes into.
6243      int Input = Idx / NumLaneElems;
6244
6245      // Turn the index into an offset from the start of the input vector.
6246      Idx -= Input * NumLaneElems;
6247
6248      // Find or create a shuffle vector operand to hold this input.
6249      unsigned OpNo;
6250      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
6251        if (InputUsed[OpNo] == Input)
6252          // This input vector is already an operand.
6253          break;
6254        if (InputUsed[OpNo] < 0) {
6255          // Create a new operand for this input vector.
6256          InputUsed[OpNo] = Input;
6257          break;
6258        }
6259      }
6260
6261      if (OpNo >= array_lengthof(InputUsed)) {
6262        // More than two input vectors used!  Give up on trying to create a
6263        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
6264        UseBuildVector = true;
6265        break;
6266      }
6267
6268      // Add the mask index for the new shuffle vector.
6269      Mask.push_back(Idx + OpNo * NumLaneElems);
6270    }
6271
6272    if (UseBuildVector) {
6273      SmallVector<SDValue, 16> SVOps;
6274      for (unsigned i = 0; i != NumLaneElems; ++i) {
6275        // The mask element.  This indexes into the input.
6276        int Idx = SVOp->getMaskElt(i+LaneStart);
6277        if (Idx < 0) {
6278          SVOps.push_back(DAG.getUNDEF(EltVT));
6279          continue;
6280        }
6281
6282        // The input vector this mask element indexes into.
6283        int Input = Idx / NumElems;
6284
6285        // Turn the index into an offset from the start of the input vector.
6286        Idx -= Input * NumElems;
6287
6288        // Extract the vector element by hand.
6289        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
6290                                    SVOp->getOperand(Input),
6291                                    DAG.getIntPtrConstant(Idx)));
6292      }
6293
6294      // Construct the output using a BUILD_VECTOR.
6295      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
6296                              SVOps.size());
6297    } else if (InputUsed[0] < 0) {
6298      // No input vectors were used! The result is undefined.
6299      Output[l] = DAG.getUNDEF(NVT);
6300    } else {
6301      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
6302                                        (InputUsed[0] % 2) * NumLaneElems,
6303                                        DAG, dl);
6304      // If only one input was used, use an undefined vector for the other.
6305      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
6306        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
6307                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
6308      // At least one input vector was used. Create a new shuffle vector.
6309      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
6310    }
6311
6312    Mask.clear();
6313  }
6314
6315  // Concatenate the result back
6316  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
6317}
6318
6319/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
6320/// 4 elements, and match them with several different shuffle types.
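/// Depending on how many elements come from each source, this emits either two
/// chained shuffles, a shufps-based sequence when three elements come from one
/// source, or a (shuffle shuffle_hi, shuffle_lo) decomposition.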
6321static SDValue
6322LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6323  SDValue V1 = SVOp->getOperand(0);
6324  SDValue V2 = SVOp->getOperand(1);
6325  DebugLoc dl = SVOp->getDebugLoc();
6326  EVT VT = SVOp->getValueType(0);
6327
6328  assert(VT.is128BitVector() && "Unsupported vector size");
6329
6330  std::pair<int, int> Locs[4];
6331  int Mask1[] = { -1, -1, -1, -1 };
6332  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
6333
6334  unsigned NumHi = 0;
6335  unsigned NumLo = 0;
6336  for (unsigned i = 0; i != 4; ++i) {
6337    int Idx = PermMask[i];
6338    if (Idx < 0) {
6339      Locs[i] = std::make_pair(-1, -1);
6340    } else {
6341      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
6342      if (Idx < 4) {
6343        Locs[i] = std::make_pair(0, NumLo);
6344        Mask1[NumLo] = Idx;
6345        NumLo++;
6346      } else {
6347        Locs[i] = std::make_pair(1, NumHi);
6348        if (2+NumHi < 4)
6349          Mask1[2+NumHi] = Idx;
6350        NumHi++;
6351      }
6352    }
6353  }
6354
6355  if (NumLo <= 2 && NumHi <= 2) {
6356    // No more than two elements come from either vector. This can be
6357    // implemented with two shuffles. The first shuffle gathers the elements.
6358    // The second shuffle, which takes the first shuffle as both of its
6359    // vector operands, puts the elements into the right order.
6360    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6361
6362    int Mask2[] = { -1, -1, -1, -1 };
6363
6364    for (unsigned i = 0; i != 4; ++i)
6365      if (Locs[i].first != -1) {
6366        unsigned Idx = (i < 2) ? 0 : 4;
6367        Idx += Locs[i].first * 2 + Locs[i].second;
6368        Mask2[i] = Idx;
6369      }
6370
6371    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
6372  }
6373
6374  if (NumLo == 3 || NumHi == 3) {
6375    // Otherwise, we must have three elements from one vector, call it X, and
6376    // one element from the other, call it Y.  First, use a shufps to build an
6377    // intermediate vector with the one element from Y and the element from X
6378    // that will be in the same half in the final destination (the indexes don't
6379    // matter). Then, use a shufps to build the final vector, taking the half
6380    // containing the element from Y from the intermediate, and the other half
6381    // from X.
6382    if (NumHi == 3) {
6383      // Normalize it so the 3 elements come from V1.
6384      CommuteVectorShuffleMask(PermMask, 4);
6385      std::swap(V1, V2);
6386    }
6387
6388    // Find the element from V2.
6389    unsigned HiIndex;
6390    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
6391      int Val = PermMask[HiIndex];
6392      if (Val < 0)
6393        continue;
6394      if (Val >= 4)
6395        break;
6396    }
6397
6398    Mask1[0] = PermMask[HiIndex];
6399    Mask1[1] = -1;
6400    Mask1[2] = PermMask[HiIndex^1];
6401    Mask1[3] = -1;
6402    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6403
6404    if (HiIndex >= 2) {
6405      Mask1[0] = PermMask[0];
6406      Mask1[1] = PermMask[1];
6407      Mask1[2] = HiIndex & 1 ? 6 : 4;
6408      Mask1[3] = HiIndex & 1 ? 4 : 6;
6409      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6410    }
6411
6412    Mask1[0] = HiIndex & 1 ? 2 : 0;
6413    Mask1[1] = HiIndex & 1 ? 0 : 2;
6414    Mask1[2] = PermMask[2];
6415    Mask1[3] = PermMask[3];
6416    if (Mask1[2] >= 0)
6417      Mask1[2] += 4;
6418    if (Mask1[3] >= 0)
6419      Mask1[3] += 4;
6420    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
6421  }
6422
6423  // Break it into (shuffle shuffle_hi, shuffle_lo).
6424  int LoMask[] = { -1, -1, -1, -1 };
6425  int HiMask[] = { -1, -1, -1, -1 };
6426
6427  int *MaskPtr = LoMask;
6428  unsigned MaskIdx = 0;
6429  unsigned LoIdx = 0;
6430  unsigned HiIdx = 2;
6431  for (unsigned i = 0; i != 4; ++i) {
6432    if (i == 2) {
6433      MaskPtr = HiMask;
6434      MaskIdx = 1;
6435      LoIdx = 0;
6436      HiIdx = 2;
6437    }
6438    int Idx = PermMask[i];
6439    if (Idx < 0) {
6440      Locs[i] = std::make_pair(-1, -1);
6441    } else if (Idx < 4) {
6442      Locs[i] = std::make_pair(MaskIdx, LoIdx);
6443      MaskPtr[LoIdx] = Idx;
6444      LoIdx++;
6445    } else {
6446      Locs[i] = std::make_pair(MaskIdx, HiIdx);
6447      MaskPtr[HiIdx] = Idx;
6448      HiIdx++;
6449    }
6450  }
6451
6452  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
6453  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
6454  int MaskOps[] = { -1, -1, -1, -1 };
6455  for (unsigned i = 0; i != 4; ++i)
6456    if (Locs[i].first != -1)
6457      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
6458  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
6459}
6460
6461static bool MayFoldVectorLoad(SDValue V) {
6462  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6463    V = V.getOperand(0);
6464
6465  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6466    V = V.getOperand(0);
6467  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
6468      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
6469    // BUILD_VECTOR (load), undef
6470    V = V.getOperand(0);
6471
6472  return MayFoldLoad(V);
6473}
6474
6475static
6476SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
6477  EVT VT = Op.getValueType();
6478
6479  // Canonicalize to v2f64.
6480  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
6481  return DAG.getNode(ISD::BITCAST, dl, VT,
6482                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
6483                                          V1, DAG));
6484}
6485
6486static
6487SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
6488                        bool HasSSE2) {
6489  SDValue V1 = Op.getOperand(0);
6490  SDValue V2 = Op.getOperand(1);
6491  EVT VT = Op.getValueType();
6492
6493  assert(VT != MVT::v2i64 && "unsupported shuffle type");
6494
6495  if (HasSSE2 && VT == MVT::v2f64)
6496    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
6497
6498  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
6499  return DAG.getNode(ISD::BITCAST, dl, VT,
6500                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
6501                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
6502                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
6503}
6504
6505static
6506SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
6507  SDValue V1 = Op.getOperand(0);
6508  SDValue V2 = Op.getOperand(1);
6509  EVT VT = Op.getValueType();
6510
6511  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
6512         "unsupported shuffle type");
6513
6514  if (V2.getOpcode() == ISD::UNDEF)
6515    V2 = V1;
6516
6517  // v4i32 or v4f32
6518  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
6519}
6520
6521static
6522SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
6523  SDValue V1 = Op.getOperand(0);
6524  SDValue V2 = Op.getOperand(1);
6525  EVT VT = Op.getValueType();
6526  unsigned NumElems = VT.getVectorNumElements();
6527
6528  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
6529  // operand of these instructions is only memory, so check if there's a
6530  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
6531  // same masks.
6532  bool CanFoldLoad = false;
6533
6534  // Trivial case, when V2 comes from a load.
6535  if (MayFoldVectorLoad(V2))
6536    CanFoldLoad = true;
6537
6538  // When V1 is a load, it can be folded later into a store in isel, example:
6539  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
6540  //    turns into:
6541  //  (MOVLPSmr addr:$src1, VR128:$src2)
6542  // So, recognize this potential and also use MOVLPS or MOVLPD
6543  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
6544    CanFoldLoad = true;
6545
6546  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6547  if (CanFoldLoad) {
6548    if (HasSSE2 && NumElems == 2)
6549      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
6550
6551    if (NumElems == 4)
6552      // If we don't care about the second element, proceed to use movss.
6553      if (SVOp->getMaskElt(1) != -1)
6554        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
6555  }
6556
6557  // movl and movlp will both match v2i64, but v2i64 is never matched by
6558  // movl earlier because we keep it strict to avoid interfering with the movlp
6559  // load folding logic (see the code above the getMOVLP call). Match it here
6560  // instead; this is horrible, but will stay like this until we move all
6561  // shuffle matching to x86 specific nodes. Note that for the 1st condition
6562  // all types are matched with movsd.
6563  if (HasSSE2) {
6564    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
6565    // as to remove this logic from here, as much as possible
6566    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
6567      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6568    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6569  }
6570
6571  assert(VT != MVT::v4i32 && "unsupported shuffle type");
6572
6573  // Invert the operand order and use SHUFPS to match it.
6574  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
6575                              getShuffleSHUFImmediate(SVOp), DAG);
6576}
6577
6578// Reduce a vector shuffle to zext.
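// For example, a v8i16 shuffle of X and undef with the mask
// <0, -1, 1, -1, 2, -1, 3, -1> becomes (bitcast (v4i32 X86ISD::VZEXT X)),
// i.e. a pmovzxwd.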
6579SDValue
6580X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
6581  // PMOVZX is only available from SSE41.
6582  if (!Subtarget->hasSSE41())
6583    return SDValue();
6584
6585  EVT VT = Op.getValueType();
6586
6587  // Only AVX2 support 256-bit vector integer extending.
6588  if (!Subtarget->hasInt256() && VT.is256BitVector())
6589    return SDValue();
6590
6591  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6592  DebugLoc DL = Op.getDebugLoc();
6593  SDValue V1 = Op.getOperand(0);
6594  SDValue V2 = Op.getOperand(1);
6595  unsigned NumElems = VT.getVectorNumElements();
6596
6597  // Extension is a unary operation, and the element type of the source vector
6598  // must be smaller than i64.
6599  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
6600      VT.getVectorElementType() == MVT::i64)
6601    return SDValue();
6602
6603  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
6604  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
6605  while ((1U << Shift) < NumElems) {
6606    if (SVOp->getMaskElt(1U << Shift) == 1)
6607      break;
6608    Shift += 1;
6609    // The maximal ratio is 8, i.e. from i8 to i64.
6610    if (Shift > 3)
6611      return SDValue();
6612  }
6613
6614  // Check the shuffle mask.
6615  unsigned Mask = (1U << Shift) - 1;
6616  for (unsigned i = 0; i != NumElems; ++i) {
6617    int EltIdx = SVOp->getMaskElt(i);
6618    if ((i & Mask) != 0 && EltIdx != -1)
6619      return SDValue();
6620    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
6621      return SDValue();
6622  }
6623
6624  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
6625  EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits);
6626  EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift);
6627
6628  if (!isTypeLegal(NVT))
6629    return SDValue();
6630
6631  // Simplify the operand as it's prepared to be fed into shuffle.
6632  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
6633  if (V1.getOpcode() == ISD::BITCAST &&
6634      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6635      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6636      V1.getOperand(0)
6637        .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
6638    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
6639    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
6640    ConstantSDNode *CIdx =
6641      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
6642    // If it's foldable, i.e. a normal load with a single use, let instruction
6643    // selection fold it. Otherwise, shorten the conversion sequence here.
6644    if (CIdx && CIdx->getZExtValue() == 0 &&
6645        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse()))
6646      V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
6647  }
6648
6649  return DAG.getNode(ISD::BITCAST, DL, VT,
6650                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
6651}
6652
6653SDValue
6654X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
6655  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6656  EVT VT = Op.getValueType();
6657  DebugLoc dl = Op.getDebugLoc();
6658  SDValue V1 = Op.getOperand(0);
6659  SDValue V2 = Op.getOperand(1);
6660
6661  if (isZeroShuffle(SVOp))
6662    return getZeroVector(VT, Subtarget, DAG, dl);
6663
6664  // Handle splat operations
6665  if (SVOp->isSplat()) {
6666    unsigned NumElem = VT.getVectorNumElements();
6667    int Size = VT.getSizeInBits();
6668
6669    // Use vbroadcast whenever the splat comes from a foldable load
6670    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
6671    if (Broadcast.getNode())
6672      return Broadcast;
6673
6674    // Handle splats by matching through known shuffle masks
6675    if ((Size == 128 && NumElem <= 4) ||
6676        (Size == 256 && NumElem <= 8))
6677      return SDValue();
6678
6679    // All remaining splats are promoted to target-supported vector shuffles.
6680    return PromoteSplat(SVOp, DAG);
6681  }
6682
6683  // Check integer expanding shuffles.
6684  SDValue NewOp = lowerVectorIntExtend(Op, DAG);
6685  if (NewOp.getNode())
6686    return NewOp;
6687
6688  // If the shuffle can be profitably rewritten as a narrower shuffle, then
6689  // do it!
6690  if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
6691      VT == MVT::v16i16 || VT == MVT::v32i8) {
6692    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6693    if (NewOp.getNode())
6694      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
6695  } else if ((VT == MVT::v4i32 ||
6696             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
6697    // FIXME: Figure out a cleaner way to do this.
6698    // Try to make use of movq to zero out the top part.
6699    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
6700      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6701      if (NewOp.getNode()) {
6702        EVT NewVT = NewOp.getValueType();
6703        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
6704                               NewVT, true, false))
6705          return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
6706                              DAG, Subtarget, dl);
6707      }
6708    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
6709      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6710      if (NewOp.getNode()) {
6711        EVT NewVT = NewOp.getValueType();
6712        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
6713          return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
6714                              DAG, Subtarget, dl);
6715      }
6716    }
6717  }
6718  return SDValue();
6719}
6720
6721SDValue
6722X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
6723  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6724  SDValue V1 = Op.getOperand(0);
6725  SDValue V2 = Op.getOperand(1);
6726  EVT VT = Op.getValueType();
6727  DebugLoc dl = Op.getDebugLoc();
6728  unsigned NumElems = VT.getVectorNumElements();
6729  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
6730  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6731  bool V1IsSplat = false;
6732  bool V2IsSplat = false;
6733  bool HasSSE2 = Subtarget->hasSSE2();
6734  bool HasFp256    = Subtarget->hasFp256();
6735  bool HasInt256   = Subtarget->hasInt256();
6736  MachineFunction &MF = DAG.getMachineFunction();
6737  bool OptForSize = MF.getFunction()->getFnAttributes().
6738    hasAttribute(Attributes::OptimizeForSize);
6739
6740  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
6741
6742  if (V1IsUndef && V2IsUndef)
6743    return DAG.getUNDEF(VT);
6744
6745  assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
6746
6747  // Vector shuffle lowering takes 3 steps:
6748  //
6749  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
6750  //    narrowing and commutation of operands should be handled.
6751  // 2) Matching of shuffles with known shuffle masks to x86 target specific
6752  //    shuffle nodes.
6753  // 3) Rewriting of unmatched masks into new generic shuffle operations,
6754  //    so the shuffle can be broken into other shuffles and the legalizer can
6755  //    try the lowering again.
6756  //
6757  // The general idea is that no vector_shuffle operation should be left to
6758  // be matched during isel, all of them must be converted to a target specific
6759  // node here.
6760
6761  // Normalize the input vectors. Here splats, zeroed vectors, profitable
6762  // narrowing and commutation of operands should be handled. The current code
6763  // does not yet cover all of those cases; work in progress...
6764  SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
6765  if (NewOp.getNode())
6766    return NewOp;
6767
6768  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
6769
6770  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
6771  // unpckh_undef). Only use pshufd if speed is more important than size.
6772  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
6773    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6774  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
6775    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6776
6777  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
6778      V2IsUndef && MayFoldVectorLoad(V1))
6779    return getMOVDDup(Op, dl, V1, DAG);
6780
6781  if (isMOVHLPS_v_undef_Mask(M, VT))
6782    return getMOVHighToLow(Op, dl, DAG);
6783
6784  // Used to match splats.
6785  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
6786      (VT == MVT::v2f64 || VT == MVT::v2i64))
6787    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6788
6789  if (isPSHUFDMask(M, VT)) {
6790    // The actual implementation matches the mask in the if above, and during
6791    // isel the node can be matched to several different instructions, not only
6792    // pshufd as its name suggests. Sad but true; emulate that behavior for now.
6793    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
6794      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
6795
6796    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
6797
6798    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
6799      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
6800
6801    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
6802      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
6803                                  DAG);
6804
6805    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
6806                                TargetMask, DAG);
6807  }
6808
6809  // Check if this can be converted into a logical shift.
6810  bool isLeft = false;
6811  unsigned ShAmt = 0;
6812  SDValue ShVal;
6813  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
6814  if (isShift && ShVal.hasOneUse()) {
6815    // If the shifted value has multiple uses, it may be cheaper to use
6816    // v_set0 + movlhps or movhlps, etc.
6817    EVT EltVT = VT.getVectorElementType();
6818    ShAmt *= EltVT.getSizeInBits();
6819    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6820  }
6821
6822  if (isMOVLMask(M, VT)) {
6823    if (ISD::isBuildVectorAllZeros(V1.getNode()))
6824      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
6825    if (!isMOVLPMask(M, VT)) {
6826      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
6827        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6828
6829      if (VT == MVT::v4i32 || VT == MVT::v4f32)
6830        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6831    }
6832  }
6833
6834  // FIXME: fold these into legal mask.
6835  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
6836    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
6837
6838  if (isMOVHLPSMask(M, VT))
6839    return getMOVHighToLow(Op, dl, DAG);
6840
6841  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
6842    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
6843
6844  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
6845    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
6846
6847  if (isMOVLPMask(M, VT))
6848    return getMOVLP(Op, dl, DAG, HasSSE2);
6849
6850  if (ShouldXformToMOVHLPS(M, VT) ||
6851      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
6852    return CommuteVectorShuffle(SVOp, DAG);
6853
6854  if (isShift) {
6855    // No better options. Use a vshldq / vsrldq.
6856    EVT EltVT = VT.getVectorElementType();
6857    ShAmt *= EltVT.getSizeInBits();
6858    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6859  }
6860
6861  bool Commuted = false;
6862  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
6863  // 1,1,1,1 -> v8i16 though.
6864  V1IsSplat = isSplatVector(V1.getNode());
6865  V2IsSplat = isSplatVector(V2.getNode());
6866
6867  // Canonicalize the splat or undef, if present, to be on the RHS.
6868  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
6869    CommuteVectorShuffleMask(M, NumElems);
6870    std::swap(V1, V2);
6871    std::swap(V1IsSplat, V2IsSplat);
6872    Commuted = true;
6873  }
6874
6875  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
6876    // Shuffling low element of v1 into undef, just return v1.
6877    if (V2IsUndef)
6878      return V1;
6879    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
6880    // the instruction selector will not match, so get a canonical MOVL with
6881    // swapped operands to undo the commute.
6882    return getMOVL(DAG, dl, VT, V2, V1);
6883  }
6884
6885  if (isUNPCKLMask(M, VT, HasInt256))
6886    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6887
6888  if (isUNPCKHMask(M, VT, HasInt256))
6889    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6890
6891  if (V2IsSplat) {
6892    // Normalize the mask so all entries that point to V2 point to its first
6893    // element, then try to match unpck{h|l} again. If it matches, return a
6894    // new vector_shuffle with the corrected mask.
6895    SmallVector<int, 8> NewMask(M.begin(), M.end());
6896    NormalizeMask(NewMask, NumElems);
6897    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
6898      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6899    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
6900      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6901  }
6902
6903  if (Commuted) {
6904    // Commute it back and try unpck* again.
6905    // FIXME: this seems wrong.
6906    CommuteVectorShuffleMask(M, NumElems);
6907    std::swap(V1, V2);
6908    std::swap(V1IsSplat, V2IsSplat);
6909    Commuted = false;
6910
6911    if (isUNPCKLMask(M, VT, HasInt256))
6912      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6913
6914    if (isUNPCKHMask(M, VT, HasInt256))
6915      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6916  }
6917
6918  // Normalize the node to match x86 shuffle ops if needed
6919  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
6920    return CommuteVectorShuffle(SVOp, DAG);
6921
6922  // The checks below are all present in isShuffleMaskLegal, but they are
6923  // inlined here right now to enable us to directly emit target specific
6924  // nodes, and remove one by one until they don't return Op anymore.
6925
6926  if (isPALIGNRMask(M, VT, Subtarget))
6927    return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
6928                                getShufflePALIGNRImmediate(SVOp),
6929                                DAG);
6930
6931  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
6932      SVOp->getSplatIndex() == 0 && V2IsUndef) {
6933    if (VT == MVT::v2f64 || VT == MVT::v2i64)
6934      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6935  }
6936
6937  if (isPSHUFHWMask(M, VT, HasInt256))
6938    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
6939                                getShufflePSHUFHWImmediate(SVOp),
6940                                DAG);
6941
6942  if (isPSHUFLWMask(M, VT, HasInt256))
6943    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
6944                                getShufflePSHUFLWImmediate(SVOp),
6945                                DAG);
6946
6947  if (isSHUFPMask(M, VT, HasFp256))
6948    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
6949                                getShuffleSHUFImmediate(SVOp), DAG);
6950
6951  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
6952    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6953  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
6954    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6955
6956  //===--------------------------------------------------------------------===//
6957  // Generate target-specific nodes for 128- or 256-bit shuffles that are
6958  // only supported in the AVX instruction set.
6959  //
6960
6961  // Handle VMOVDDUPY permutations
6962  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
6963    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
6964
6965  // Handle VPERMILPS/D* permutations
6966  if (isVPERMILPMask(M, VT, HasFp256)) {
6967    if (HasInt256 && VT == MVT::v8i32)
6968      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
6969                                  getShuffleSHUFImmediate(SVOp), DAG);
6970    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
6971                                getShuffleSHUFImmediate(SVOp), DAG);
6972  }
6973
6974  // Handle VPERM2F128/VPERM2I128 permutations
6975  if (isVPERM2X128Mask(M, VT, HasFp256))
6976    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
6977                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
6978
6979  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
6980  if (BlendOp.getNode())
6981    return BlendOp;
6982
6983  if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
6984    SmallVector<SDValue, 8> permclMask;
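    // VPERMD/VPERMPS take the permutation as a vector operand. Undef mask
    // entries may use any index, so map them to 0.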
6985    for (unsigned i = 0; i != 8; ++i) {
6986      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
6987    }
6988    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
6989                               &permclMask[0], 8);
6990    // The bitcast is for VPERMPS: the mask is v8i32 but the node takes v8f32.
6991    return DAG.getNode(X86ISD::VPERMV, dl, VT,
6992                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
6993  }
6994
6995  if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
6996    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
6997                                getShuffleCLImmediate(SVOp), DAG);
6998
6999
7000  //===--------------------------------------------------------------------===//
7001  // Since no target specific shuffle was selected for this generic one,
7002  // lower it into other known shuffles. FIXME: this isn't true yet, but
7003  // this is the plan.
7004  //
7005
7006  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
7007  if (VT == MVT::v8i16) {
7008    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
7009    if (NewOp.getNode())
7010      return NewOp;
7011  }
7012
7013  if (VT == MVT::v16i8) {
7014    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
7015    if (NewOp.getNode())
7016      return NewOp;
7017  }
7018
7019  if (VT == MVT::v32i8) {
7020    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
7021    if (NewOp.getNode())
7022      return NewOp;
7023  }
7024
7025  // Handle all 128-bit wide vectors with 4 elements, and match them with
7026  // several different shuffle types.
7027  if (NumElems == 4 && VT.is128BitVector())
7028    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
7029
7030  // Handle general 256-bit shuffles
7031  if (VT.is256BitVector())
7032    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
7033
7034  return SDValue();
7035}
7036
7037SDValue
7038X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
7039                                                SelectionDAG &DAG) const {
7040  EVT VT = Op.getValueType();
7041  DebugLoc dl = Op.getDebugLoc();
7042
7043  if (!Op.getOperand(0).getValueType().is128BitVector())
7044    return SDValue();
7045
7046  if (VT.getSizeInBits() == 8) {
7047    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
7048                                  Op.getOperand(0), Op.getOperand(1));
7049    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
7050                                  DAG.getValueType(VT));
7051    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7052  }
7053
7054  if (VT.getSizeInBits() == 16) {
7055    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7056    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
7057    if (Idx == 0)
7058      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
7059                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7060                                     DAG.getNode(ISD::BITCAST, dl,
7061                                                 MVT::v4i32,
7062                                                 Op.getOperand(0)),
7063                                     Op.getOperand(1)));
7064    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
7065                                  Op.getOperand(0), Op.getOperand(1));
7066    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
7067                                  DAG.getValueType(VT));
7068    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7069  }
7070
7071  if (VT == MVT::f32) {
7072    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
7073    // the result back to an FR32 register. It's only worth matching if the
7074    // result has a single use which is a store or a bitcast to i32.  And in
7075    // the case of a store, it's not worth it if the index is a constant 0,
7076    // because a MOVSSmr can be used instead, which is smaller and faster.
7077    if (!Op.hasOneUse())
7078      return SDValue();
7079    SDNode *User = *Op.getNode()->use_begin();
7080    if ((User->getOpcode() != ISD::STORE ||
7081         (isa<ConstantSDNode>(Op.getOperand(1)) &&
7082          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
7083        (User->getOpcode() != ISD::BITCAST ||
7084         User->getValueType(0) != MVT::i32))
7085      return SDValue();
7086    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7087                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
7088                                              Op.getOperand(0)),
7089                                              Op.getOperand(1));
7090    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
7091  }
7092
7093  if (VT == MVT::i32 || VT == MVT::i64) {
7094    // EXTRACTPS/PEXTRQ work with a constant index.
7095    if (isa<ConstantSDNode>(Op.getOperand(1)))
7096      return Op;
7097  }
7098  return SDValue();
7099}
7100
7101
7102SDValue
7103X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
7104                                           SelectionDAG &DAG) const {
7105  if (!isa<ConstantSDNode>(Op.getOperand(1)))
7106    return SDValue();
7107
7108  SDValue Vec = Op.getOperand(0);
7109  EVT VecVT = Vec.getValueType();
7110
7111  // If this is a 256-bit vector result, first extract the 128-bit vector and
7112  // then extract the element from the 128-bit vector.
7113  if (VecVT.is256BitVector()) {
7114    DebugLoc dl = Op.getNode()->getDebugLoc();
7115    unsigned NumElems = VecVT.getVectorNumElements();
7116    SDValue Idx = Op.getOperand(1);
7117    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7118
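    // For example, extracting element 5 from a v8i32 first extracts the upper
    // v4i32 half and then extracts element 1 from it.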
7119    // Get the 128-bit vector.
7120    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
7121
7122    if (IdxVal >= NumElems/2)
7123      IdxVal -= NumElems/2;
7124    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
7125                       DAG.getConstant(IdxVal, MVT::i32));
7126  }
7127
7128  assert(VecVT.is128BitVector() && "Unexpected vector length");
7129
7130  if (Subtarget->hasSSE41()) {
7131    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
7132    if (Res.getNode())
7133      return Res;
7134  }
7135
7136  EVT VT = Op.getValueType();
7137  DebugLoc dl = Op.getDebugLoc();
7138  // TODO: handle v16i8.
7139  if (VT.getSizeInBits() == 16) {
7140    SDValue Vec = Op.getOperand(0);
7141    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7142    if (Idx == 0)
7143      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
7144                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7145                                     DAG.getNode(ISD::BITCAST, dl,
7146                                                 MVT::v4i32, Vec),
7147                                     Op.getOperand(1)));
7148    // Transform it so it matches pextrw, which produces a 32-bit result.
7149    EVT EltVT = MVT::i32;
7150    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
7151                                  Op.getOperand(0), Op.getOperand(1));
7152    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
7153                                  DAG.getValueType(VT));
7154    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7155  }
7156
7157  if (VT.getSizeInBits() == 32) {
7158    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7159    if (Idx == 0)
7160      return Op;
7161
7162    // SHUFPS the element to the lowest double word, then movss.
7163    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
7164    EVT VVT = Op.getOperand(0).getValueType();
7165    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7166                                       DAG.getUNDEF(VVT), Mask);
7167    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7168                       DAG.getIntPtrConstant(0));
7169  }
7170
7171  if (VT.getSizeInBits() == 64) {
7172    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
7173    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
7174    //        to match extract_elt for f64.
7175    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7176    if (Idx == 0)
7177      return Op;
7178
7179    // UNPCKHPD the element to the lowest double word, then movsd.
7180    // Note: if the lower 64 bits of the result of the UNPCKHPD are then stored
7181    // to an f64mem, the whole operation is folded into a single MOVHPDmr.
7182    int Mask[2] = { 1, -1 };
7183    EVT VVT = Op.getOperand(0).getValueType();
7184    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7185                                       DAG.getUNDEF(VVT), Mask);
7186    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7187                       DAG.getIntPtrConstant(0));
7188  }
7189
7190  return SDValue();
7191}
7192
7193SDValue
7194X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
7195                                               SelectionDAG &DAG) const {
7196  EVT VT = Op.getValueType();
7197  EVT EltVT = VT.getVectorElementType();
7198  DebugLoc dl = Op.getDebugLoc();
7199
7200  SDValue N0 = Op.getOperand(0);
7201  SDValue N1 = Op.getOperand(1);
7202  SDValue N2 = Op.getOperand(2);
7203
7204  if (!VT.is128BitVector())
7205    return SDValue();
7206
7207  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
7208      isa<ConstantSDNode>(N2)) {
7209    unsigned Opc;
7210    if (VT == MVT::v8i16)
7211      Opc = X86ISD::PINSRW;
7212    else // VT == MVT::v16i8
7213      Opc = X86ISD::PINSRB;
7216
7217    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
7218    // second argument.
7219    if (N1.getValueType() != MVT::i32)
7220      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7221    if (N2.getValueType() != MVT::i32)
7222      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7223    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
7224  }
7225
7226  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
7227    // Bits [7:6] of the constant are the source select.  This will always be
7228    //  zero here.  The DAG Combiner may combine an extract_elt index into these
7229    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
7230    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
7231    // Bits [5:4] of the constant are the destination select.  This is the
7232    //  value of the incoming immediate.
7233    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
7234    //   combine either bitwise AND or insert of float 0.0 to set these bits.
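    // For example, an insertion into element 2 yields an immediate of 0x20
    // (destination select 2 in bits [5:4], no zero mask).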
7235    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
7236    // Create this as a scalar to vector.
7237    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
7238    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
7239  }
7240
7241  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
7242    // PINSR* works with a constant index.
7243    return Op;
7244  }
7245  return SDValue();
7246}
7247
7248SDValue
7249X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
7250  EVT VT = Op.getValueType();
7251  EVT EltVT = VT.getVectorElementType();
7252
7253  DebugLoc dl = Op.getDebugLoc();
7254  SDValue N0 = Op.getOperand(0);
7255  SDValue N1 = Op.getOperand(1);
7256  SDValue N2 = Op.getOperand(2);
7257
7258  // If this is a 256-bit vector result, first extract the 128-bit vector,
7259  // insert the element into the extracted half and then place it back.
7260  if (VT.is256BitVector()) {
7261    if (!isa<ConstantSDNode>(N2))
7262      return SDValue();
7263
7264    // Get the desired 128-bit vector half.
7265    unsigned NumElems = VT.getVectorNumElements();
7266    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
7267    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
7268
7269    // Insert the element into the desired half.
7270    bool Upper = IdxVal >= NumElems/2;
7271    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
7272                 DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
7273
7274    // Insert the changed part back to the 256-bit vector
7275    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
7276  }
7277
7278  if (Subtarget->hasSSE41())
7279    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
7280
7281  if (EltVT == MVT::i8)
7282    return SDValue();
7283
7284  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
7285    // Transform it so it matches pinsrw, which expects a 16-bit value in a
7286    // GR32 as its second argument.
7287    if (N1.getValueType() != MVT::i32)
7288      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7289    if (N2.getValueType() != MVT::i32)
7290      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7291    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
7292  }
7293  return SDValue();
7294}
7295
7296static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
7297  LLVMContext *Context = DAG.getContext();
7298  DebugLoc dl = Op.getDebugLoc();
7299  EVT OpVT = Op.getValueType();
7300
7301  // If this is a 256-bit vector result, first insert into a 128-bit
7302  // vector and then insert into the 256-bit vector.
7303  if (!OpVT.is128BitVector()) {
7304    // Insert into a 128-bit vector.
7305    EVT VT128 = EVT::getVectorVT(*Context,
7306                                 OpVT.getVectorElementType(),
7307                                 OpVT.getVectorNumElements() / 2);
7308
7309    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
7310
7311    // Insert the 128-bit vector.
7312    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
7313  }
7314
7315  if (OpVT == MVT::v1i64 &&
7316      Op.getOperand(0).getValueType() == MVT::i64)
7317    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
7318
7319  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
7320  assert(OpVT.is128BitVector() && "Expected an SSE type!");
7321  return DAG.getNode(ISD::BITCAST, dl, OpVT,
7322                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
7323}
7324
7325// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
7326// a simple subregister reference or explicit instructions to grab
7327// upper bits of a vector.
7328static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7329                                      SelectionDAG &DAG) {
7330  if (Subtarget->hasFp256()) {
7331    DebugLoc dl = Op.getNode()->getDebugLoc();
7332    SDValue Vec = Op.getNode()->getOperand(0);
7333    SDValue Idx = Op.getNode()->getOperand(1);
7334
7335    if (Op.getNode()->getValueType(0).is128BitVector() &&
7336        Vec.getNode()->getValueType(0).is256BitVector() &&
7337        isa<ConstantSDNode>(Idx)) {
7338      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7339      return Extract128BitVector(Vec, IdxVal, DAG, dl);
7340    }
7341  }
7342  return SDValue();
7343}
7344
7345// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
7346// simple superregister reference or explicit instructions to insert
7347// the upper bits of a vector.
7348static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7349                                     SelectionDAG &DAG) {
7350  if (Subtarget->hasFp256()) {
7351    DebugLoc dl = Op.getNode()->getDebugLoc();
7352    SDValue Vec = Op.getNode()->getOperand(0);
7353    SDValue SubVec = Op.getNode()->getOperand(1);
7354    SDValue Idx = Op.getNode()->getOperand(2);
7355
7356    if (Op.getNode()->getValueType(0).is256BitVector() &&
7357        SubVec.getNode()->getValueType(0).is128BitVector() &&
7358        isa<ConstantSDNode>(Idx)) {
7359      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7360      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
7361    }
7362  }
7363  return SDValue();
7364}
7365
7366// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
7367// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
7368// one of the above mentioned nodes. It has to be wrapped because otherwise
7369// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
7370// be used to form an addressing mode. These wrapped nodes will be selected
7371// into MOV32ri.
7372SDValue
7373X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
7374  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
7375
7376  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7377  // global base reg.
7378  unsigned char OpFlag = 0;
7379  unsigned WrapperKind = X86ISD::Wrapper;
7380  CodeModel::Model M = getTargetMachine().getCodeModel();
7381
7382  if (Subtarget->isPICStyleRIPRel() &&
7383      (M == CodeModel::Small || M == CodeModel::Kernel))
7384    WrapperKind = X86ISD::WrapperRIP;
7385  else if (Subtarget->isPICStyleGOT())
7386    OpFlag = X86II::MO_GOTOFF;
7387  else if (Subtarget->isPICStyleStubPIC())
7388    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7389
7390  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
7391                                             CP->getAlignment(),
7392                                             CP->getOffset(), OpFlag);
7393  DebugLoc DL = CP->getDebugLoc();
7394  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7395  // With PIC, the address is actually $g + Offset.
7396  if (OpFlag) {
7397    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7398                         DAG.getNode(X86ISD::GlobalBaseReg,
7399                                     DebugLoc(), getPointerTy()),
7400                         Result);
7401  }
7402
7403  return Result;
7404}
7405
7406SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
7407  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
7408
7409  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7410  // global base reg.
7411  unsigned char OpFlag = 0;
7412  unsigned WrapperKind = X86ISD::Wrapper;
7413  CodeModel::Model M = getTargetMachine().getCodeModel();
7414
7415  if (Subtarget->isPICStyleRIPRel() &&
7416      (M == CodeModel::Small || M == CodeModel::Kernel))
7417    WrapperKind = X86ISD::WrapperRIP;
7418  else if (Subtarget->isPICStyleGOT())
7419    OpFlag = X86II::MO_GOTOFF;
7420  else if (Subtarget->isPICStyleStubPIC())
7421    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7422
7423  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
7424                                          OpFlag);
7425  DebugLoc DL = JT->getDebugLoc();
7426  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7427
7428  // With PIC, the address is actually $g + Offset.
7429  if (OpFlag)
7430    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7431                         DAG.getNode(X86ISD::GlobalBaseReg,
7432                                     DebugLoc(), getPointerTy()),
7433                         Result);
7434
7435  return Result;
7436}
7437
7438SDValue
7439X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
7440  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
7441
7442  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7443  // global base reg.
7444  unsigned char OpFlag = 0;
7445  unsigned WrapperKind = X86ISD::Wrapper;
7446  CodeModel::Model M = getTargetMachine().getCodeModel();
7447
7448  if (Subtarget->isPICStyleRIPRel() &&
7449      (M == CodeModel::Small || M == CodeModel::Kernel)) {
7450    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
7451      OpFlag = X86II::MO_GOTPCREL;
7452    WrapperKind = X86ISD::WrapperRIP;
7453  } else if (Subtarget->isPICStyleGOT()) {
7454    OpFlag = X86II::MO_GOT;
7455  } else if (Subtarget->isPICStyleStubPIC()) {
7456    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
7457  } else if (Subtarget->isPICStyleStubNoDynamic()) {
7458    OpFlag = X86II::MO_DARWIN_NONLAZY;
7459  }
7460
7461  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
7462
7463  DebugLoc DL = Op.getDebugLoc();
7464  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7465
7466
7467  // With PIC, the address is actually $g + Offset.
7468  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
7469      !Subtarget->is64Bit()) {
7470    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7471                         DAG.getNode(X86ISD::GlobalBaseReg,
7472                                     DebugLoc(), getPointerTy()),
7473                         Result);
7474  }
7475
7476  // For symbols that require a load from a stub to get the address, emit the
7477  // load.
7478  if (isGlobalStubReference(OpFlag))
7479    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
7480                         MachinePointerInfo::getGOT(), false, false, false, 0);
7481
7482  return Result;
7483}
7484
7485SDValue
7486X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
7487  // Create the TargetBlockAddressAddress node.
7488  unsigned char OpFlags =
7489    Subtarget->ClassifyBlockAddressReference();
7490  CodeModel::Model M = getTargetMachine().getCodeModel();
7491  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
7492  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
7493  DebugLoc dl = Op.getDebugLoc();
7494  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
7495                                             OpFlags);
7496
7497  if (Subtarget->isPICStyleRIPRel() &&
7498      (M == CodeModel::Small || M == CodeModel::Kernel))
7499    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7500  else
7501    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7502
7503  // With PIC, the address is actually $g + Offset.
7504  if (isGlobalRelativeToPICBase(OpFlags)) {
7505    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7506                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7507                         Result);
7508  }
7509
7510  return Result;
7511}
7512
7513SDValue
7514X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
7515                                      int64_t Offset,
7516                                      SelectionDAG &DAG) const {
7517  // Create the TargetGlobalAddress node, folding in the constant
7518  // offset if it is legal.
7519  unsigned char OpFlags =
7520    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
7521  CodeModel::Model M = getTargetMachine().getCodeModel();
7522  SDValue Result;
7523  if (OpFlags == X86II::MO_NO_FLAG &&
7524      X86::isOffsetSuitableForCodeModel(Offset, M)) {
7525    // A direct static reference to a global.
7526    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
7527    Offset = 0;
7528  } else {
7529    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
7530  }
7531
7532  if (Subtarget->isPICStyleRIPRel() &&
7533      (M == CodeModel::Small || M == CodeModel::Kernel))
7534    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7535  else
7536    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7537
7538  // With PIC, the address is actually $g + Offset.
7539  if (isGlobalRelativeToPICBase(OpFlags)) {
7540    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7541                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7542                         Result);
7543  }
7544
7545  // For globals that require a load from a stub to get the address, emit the
7546  // load.
7547  if (isGlobalStubReference(OpFlags))
7548    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
7549                         MachinePointerInfo::getGOT(), false, false, false, 0);
7550
7551  // If there was a non-zero offset that we didn't fold, create an explicit
7552  // addition for it.
7553  if (Offset != 0)
7554    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
7555                         DAG.getConstant(Offset, getPointerTy()));
7556
7557  return Result;
7558}
7559
7560SDValue
7561X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
7562  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
7563  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
7564  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
7565}
7566
7567static SDValue
7568GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
7569           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
7570           unsigned char OperandFlags, bool LocalDynamic = false) {
7571  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7572  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7573  DebugLoc dl = GA->getDebugLoc();
7574  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7575                                           GA->getValueType(0),
7576                                           GA->getOffset(),
7577                                           OperandFlags);
7578
7579  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
7580                                           : X86ISD::TLSADDR;
7581
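  // The node built below is later expanded into the __tls_get_addr call
  // sequence; the resulting address comes back in ReturnReg (EAX or RAX).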
7582  if (InFlag) {
7583    SDValue Ops[] = { Chain,  TGA, *InFlag };
7584    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3);
7585  } else {
7586    SDValue Ops[]  = { Chain, TGA };
7587    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2);
7588  }
7589
7590  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
7591  MFI->setAdjustsStack(true);
7592
7593  SDValue Flag = Chain.getValue(1);
7594  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
7595}
7596
7597// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
7598static SDValue
7599LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7600                                const EVT PtrVT) {
7601  SDValue InFlag;
7602  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
7603  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
7604                                   DAG.getNode(X86ISD::GlobalBaseReg,
7605                                               DebugLoc(), PtrVT), InFlag);
7606  InFlag = Chain.getValue(1);
7607
7608  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
7609}
7610
7611// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
7612static SDValue
7613LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7614                                const EVT PtrVT) {
7615  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
7616                    X86::RAX, X86II::MO_TLSGD);
7617}
7618
7619static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
7620                                           SelectionDAG &DAG,
7621                                           const EVT PtrVT,
7622                                           bool is64Bit) {
7623  DebugLoc dl = GA->getDebugLoc();
7624
7625  // Get the start address of the TLS block for this module.
7626  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
7627      .getInfo<X86MachineFunctionInfo>();
7628  MFI->incNumLocalDynamicTLSAccesses();
7629
7630  SDValue Base;
7631  if (is64Bit) {
7632    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
7633                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
7634  } else {
7635    SDValue InFlag;
7636    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
7637        DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
7638    InFlag = Chain.getValue(1);
7639    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
7640                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
7641  }
7642
7643  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
7644  // of Base.
7645
7646  // Build x@dtpoff.
7647  unsigned char OperandFlags = X86II::MO_DTPOFF;
7648  unsigned WrapperKind = X86ISD::Wrapper;
7649  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7650                                           GA->getValueType(0),
7651                                           GA->getOffset(), OperandFlags);
7652  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
7653
7654  // Add x@dtpoff with the base.
7655  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
7656}
7657
7658// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
7659static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7660                                   const EVT PtrVT, TLSModel::Model model,
7661                                   bool is64Bit, bool isPIC) {
7662  DebugLoc dl = GA->getDebugLoc();
7663
7664  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
7665  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
7666                                                         is64Bit ? 257 : 256));
7667
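  // Address space 256 selects the %gs segment and 257 selects %fs, so the
  // load below reads the thread pointer from %gs:0 or %fs:0.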
7668  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
7669                                      DAG.getIntPtrConstant(0),
7670                                      MachinePointerInfo(Ptr),
7671                                      false, false, false, 0);
7672
7673  unsigned char OperandFlags = 0;
7674  // Most TLS accesses are not RIP-relative, even on x86-64. One exception is
7675  // the initial exec model.
7676  unsigned WrapperKind = X86ISD::Wrapper;
7677  if (model == TLSModel::LocalExec) {
7678    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
7679  } else if (model == TLSModel::InitialExec) {
7680    if (is64Bit) {
7681      OperandFlags = X86II::MO_GOTTPOFF;
7682      WrapperKind = X86ISD::WrapperRIP;
7683    } else {
7684      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
7685    }
7686  } else {
7687    llvm_unreachable("Unexpected model");
7688  }
7689
7690  // emit "addl x@ntpoff,%eax" (local exec)
7691  // or "addl x@indntpoff,%eax" (initial exec)
7692  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
7693  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7694                                           GA->getValueType(0),
7695                                           GA->getOffset(), OperandFlags);
7696  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
7697
7698  if (model == TLSModel::InitialExec) {
7699    if (isPIC && !is64Bit) {
7700      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
7701                          DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
7702                           Offset);
7703    }
7704
7705    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
7706                         MachinePointerInfo::getGOT(), false, false, false,
7707                         0);
7708  }
7709
7710  // The address of the thread local variable is the add of the thread
7711  // pointer with the offset of the variable.
7712  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
7713}
7714
7715SDValue
7716X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
7717
7718  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
7719  const GlobalValue *GV = GA->getGlobal();
7720
7721  if (Subtarget->isTargetELF()) {
7722    TLSModel::Model model = getTargetMachine().getTLSModel(GV);
7723
7724    switch (model) {
7725      case TLSModel::GeneralDynamic:
7726        if (Subtarget->is64Bit())
7727          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
7728        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
7729      case TLSModel::LocalDynamic:
7730        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
7731                                           Subtarget->is64Bit());
7732      case TLSModel::InitialExec:
7733      case TLSModel::LocalExec:
7734        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
7735                                   Subtarget->is64Bit(),
7736                         getTargetMachine().getRelocationModel() == Reloc::PIC_);
7737    }
7738    llvm_unreachable("Unknown TLS model.");
7739  }
7740
7741  if (Subtarget->isTargetDarwin()) {
7742    // Darwin only has one model of TLS.  Lower to that.
7743    unsigned char OpFlag = 0;
7744    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
7745                           X86ISD::WrapperRIP : X86ISD::Wrapper;
7746
7747    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7748    // global base reg.
7749    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
7750                  !Subtarget->is64Bit();
7751    if (PIC32)
7752      OpFlag = X86II::MO_TLVP_PIC_BASE;
7753    else
7754      OpFlag = X86II::MO_TLVP;
7755    DebugLoc DL = Op.getDebugLoc();
7756    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
7757                                                GA->getValueType(0),
7758                                                GA->getOffset(), OpFlag);
7759    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7760
7761    // With PIC32, the address is actually $g + Offset.
7762    if (PIC32)
7763      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7764                           DAG.getNode(X86ISD::GlobalBaseReg,
7765                                       DebugLoc(), getPointerTy()),
7766                           Offset);
7767
7768    // Lowering the X86ISD::TLSCALL node will make sure everything is in the
7769    // right location.
7770    SDValue Chain = DAG.getEntryNode();
7771    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7772    SDValue Args[] = { Chain, Offset };
7773    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
7774
7775    // TLSCALL is codegen'ed as a call. Inform MFI that the function has calls.
7776    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7777    MFI->setAdjustsStack(true);
7778
7779    // And our return value (tls address) is in the standard call return value
7780    // location.
7781    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
7782    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
7783                              Chain.getValue(1));
7784  }
7785
7786  if (Subtarget->isTargetWindows()) {
7787    // Just use the implicit TLS architecture.
7788    // We need to generate something similar to:
7789    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
7790    //                                  ; from TEB
7791    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
7792    //   mov     rcx, qword [rdx+rcx*8]
7793    //   mov     eax, .tls$:tlsvar
7794    //   [rax+rcx] contains the address
7795    // Windows 64bit: gs:0x58
7796    // Windows 32bit: fs:__tls_array
7797
7798    // If GV is an alias then use the aliasee for determining
7799    // thread-localness.
7800    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
7801      GV = GA->resolveAliasedGlobal(false);
7802    DebugLoc dl = GA->getDebugLoc();
7803    SDValue Chain = DAG.getEntryNode();
7804
7805    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
7806    // %gs:0x58 (64-bit).
7807    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
7808                                        ? Type::getInt8PtrTy(*DAG.getContext(),
7809                                                             256)
7810                                        : Type::getInt32PtrTy(*DAG.getContext(),
7811                                                              257));
7812
7813    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain,
7814                                        Subtarget->is64Bit()
7815                                        ? DAG.getIntPtrConstant(0x58)
7816                                        : DAG.getExternalSymbol("_tls_array",
7817                                                                getPointerTy()),
7818                                        MachinePointerInfo(Ptr),
7819                                        false, false, false, 0);
7820
7821    // Load the _tls_index variable
7822    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
7823    if (Subtarget->is64Bit())
7824      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
7825                           IDX, MachinePointerInfo(), MVT::i32,
7826                           false, false, 0);
7827    else
7828      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
7829                        false, false, false, 0);
7830
7831    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
7832                                    getPointerTy());
7833    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
7834
7835    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
7836    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
7837                      false, false, false, 0);
7838
7839    // Get the offset of start of .tls section
7840    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7841                                             GA->getValueType(0),
7842                                             GA->getOffset(), X86II::MO_SECREL);
7843    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
7844
7845    // The address of the thread local variable is the add of the thread
7846    // pointer with the offset of the variable.
7847    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
7848  }
7849
7850  llvm_unreachable("TLS not implemented for this target.");
7851}
7852
7853
7854/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
7855/// and take a 2 x i32 value to shift plus a shift amount.
7856SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
7857  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
7858  EVT VT = Op.getValueType();
7859  unsigned VTBits = VT.getSizeInBits();
7860  DebugLoc dl = Op.getDebugLoc();
7861  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
7862  SDValue ShOpLo = Op.getOperand(0);
7863  SDValue ShOpHi = Op.getOperand(1);
7864  SDValue ShAmt  = Op.getOperand(2);
7865  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
7866                                     DAG.getConstant(VTBits - 1, MVT::i8))
7867                       : DAG.getConstant(0, VT);
7868
7869  SDValue Tmp2, Tmp3;
7870  if (Op.getOpcode() == ISD::SHL_PARTS) {
7871    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
7872    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
7873  } else {
7874    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
7875    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
7876  }
7877
7878  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
7879                                DAG.getConstant(VTBits, MVT::i8));
7880  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
7881                             AndNode, DAG.getConstant(0, MVT::i8));
7882
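  // Use CMOV nodes on the comparison above to pick the correct pair: when
  // (ShAmt & VTBits) is non-zero the shift amount is at least VTBits, so the
  // result must be taken from the alternative operands.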
7883  SDValue Hi, Lo;
7884  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
7885  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
7886  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
7887
7888  if (Op.getOpcode() == ISD::SHL_PARTS) {
7889    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7890    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7891  } else {
7892    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7893    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7894  }
7895
7896  SDValue Ops[2] = { Lo, Hi };
7897  return DAG.getMergeValues(Ops, 2, dl);
7898}
7899
7900SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
7901                                           SelectionDAG &DAG) const {
7902  EVT SrcVT = Op.getOperand(0).getValueType();
7903
7904  if (SrcVT.isVector())
7905    return SDValue();
7906
7907  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
7908         "Unknown SINT_TO_FP to lower!");
7909
7910  // These are really Legal; return the operand so the caller accepts it as
7911  // Legal.
7912  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
7913    return Op;
7914  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
7915      Subtarget->is64Bit()) {
7916    return Op;
7917  }
7918
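  // Otherwise, spill the integer to a stack slot and convert it with the x87
  // FILD instruction (see BuildFILD below).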
7919  DebugLoc dl = Op.getDebugLoc();
7920  unsigned Size = SrcVT.getSizeInBits()/8;
7921  MachineFunction &MF = DAG.getMachineFunction();
7922  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
7923  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7924  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7925                               StackSlot,
7926                               MachinePointerInfo::getFixedStack(SSFI),
7927                               false, false, 0);
7928  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
7929}
7930
7931SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
7932                                     SDValue StackSlot,
7933                                     SelectionDAG &DAG) const {
7934  // Build the FILD
7935  DebugLoc DL = Op.getDebugLoc();
7936  SDVTList Tys;
7937  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
7938  if (useSSE)
7939    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
7940  else
7941    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
7942
7943  unsigned ByteSize = SrcVT.getSizeInBits()/8;
7944
7945  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
7946  MachineMemOperand *MMO;
7947  if (FI) {
7948    int SSFI = FI->getIndex();
7949    MMO =
7950      DAG.getMachineFunction()
7951      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7952                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
7953  } else {
7954    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
7955    StackSlot = StackSlot.getOperand(1);
7956  }
7957  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
7958  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
7959                                           X86ISD::FILD, DL,
7960                                           Tys, Ops, array_lengthof(Ops),
7961                                           SrcVT, MMO);
7962
7963  if (useSSE) {
7964    Chain = Result.getValue(1);
7965    SDValue InFlag = Result.getValue(2);
7966
7967    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
7968    // shouldn't be necessary except that RFP cannot be live across
7969    // multiple blocks. When stackifier is fixed, they can be uncoupled.
7970    MachineFunction &MF = DAG.getMachineFunction();
7971    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
7972    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
7973    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7974    Tys = DAG.getVTList(MVT::Other);
7975    SDValue Ops[] = {
7976      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
7977    };
7978    MachineMemOperand *MMO =
7979      DAG.getMachineFunction()
7980      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7981                            MachineMemOperand::MOStore, SSFISize, SSFISize);
7982
7983    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
7984                                    Ops, array_lengthof(Ops),
7985                                    Op.getValueType(), MMO);
7986    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
7987                         MachinePointerInfo::getFixedStack(SSFI),
7988                         false, false, false, 0);
7989  }
7990
7991  return Result;
7992}
7993
7994// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
7995SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
7996                                               SelectionDAG &DAG) const {
7997  // This algorithm is not obvious. Here is what we're trying to output:
7998  /*
7999     movq       %rax,  %xmm0
8000     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
8001     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
8002     #ifdef __SSE3__
8003       haddpd   %xmm0, %xmm0
8004     #else
8005       pshufd   $0x4e, %xmm0, %xmm1
8006       addpd    %xmm1, %xmm0
8007     #endif
8008  */
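  // Sketch of why the constants work: punpckldq interleaves the two 32-bit
  // halves of the input with the exponent patterns 0x43300000 (2^52) and
  // 0x45300000 (2^84), producing the doubles (2^52 + lo32) and
  // (2^84 + hi32 * 2^32).  Subtracting c1 = { 2^52, 2^84 } recovers lo32 and
  // hi32 * 2^32 exactly, and the horizontal add sums them into the result.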
8009
8010  DebugLoc dl = Op.getDebugLoc();
8011  LLVMContext *Context = DAG.getContext();
8012
8013  // Build some magic constants.
8014  const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
8015  Constant *C0 = ConstantDataVector::get(*Context, CV0);
8016  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
8017
8018  SmallVector<Constant*,2> CV1;
8019  CV1.push_back(
8020        ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
8021  CV1.push_back(
8022        ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
8023  Constant *C1 = ConstantVector::get(CV1);
8024  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
8025
8026  // Load the 64-bit value into an XMM register.
8027  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
8028                            Op.getOperand(0));
8029  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
8030                              MachinePointerInfo::getConstantPool(),
8031                              false, false, false, 16);
8032  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
8033                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
8034                              CLod0);
8035
8036  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
8037                              MachinePointerInfo::getConstantPool(),
8038                              false, false, false, 16);
8039  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
8040  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
8041  SDValue Result;
8042
8043  if (Subtarget->hasSSE3()) {
8044    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
8045    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
8046  } else {
8047    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
8048    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
8049                                           S2F, 0x4E, DAG);
8050    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
8051                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
8052                         Sub);
8053  }
8054
8055  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
8056                     DAG.getIntPtrConstant(0));
8057}
8058
8059// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
8060SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
8061                                               SelectionDAG &DAG) const {
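  // The idea: OR the 32-bit value into the mantissa of the double 2^52 (bit
  // pattern 0x4330000000000000), which yields exactly 2^52 + x, then subtract
  // 2^52 again.  Roughly the following sequence (illustrative only; the exact
  // code depends on instruction selection):
  /*
     movd     %eax,  %xmm0
     orpd     (c0),  %xmm0   // c0: (double) 0x1.0p52
     subsd    (c0),  %xmm0
     cvtsd2ss %xmm0, %xmm0   // only if the destination type is f32
  */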
8062  DebugLoc dl = Op.getDebugLoc();
8063  // FP constant to bias correct the final result.
8064  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
8065                                   MVT::f64);
8066
8067  // Load the 32-bit value into an XMM register.
8068  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
8069                             Op.getOperand(0));
8070
8071  // Zero out the upper parts of the register.
8072  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
8073
8074  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8075                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
8076                     DAG.getIntPtrConstant(0));
8077
8078  // Or the load with the bias.
8079  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
8080                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
8081                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8082                                                   MVT::v2f64, Load)),
8083                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
8084                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8085                                                   MVT::v2f64, Bias)));
8086  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8087                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
8088                   DAG.getIntPtrConstant(0));
8089
8090  // Subtract the bias.
8091  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
8092
8093  // Handle final rounding.
8094  EVT DestVT = Op.getValueType();
8095
8096  if (DestVT.bitsLT(MVT::f64))
8097    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
8098                       DAG.getIntPtrConstant(0));
8099  if (DestVT.bitsGT(MVT::f64))
8100    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
8101
8102  // No rounding is needed; the result is already f64.
8103  return Sub;
8104}
8105
8106SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
8107                                               SelectionDAG &DAG) const {
8108  SDValue N0 = Op.getOperand(0);
8109  EVT SVT = N0.getValueType();
8110  DebugLoc dl = Op.getDebugLoc();
8111
8112  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
8113          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
8114         "Custom UINT_TO_FP is not supported!");
8115
8116  EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, SVT.getVectorNumElements());
8117  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
8118                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
8119}
8120
8121SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
8122                                           SelectionDAG &DAG) const {
8123  SDValue N0 = Op.getOperand(0);
8124  DebugLoc dl = Op.getDebugLoc();
8125
8126  if (Op.getValueType().isVector())
8127    return lowerUINT_TO_FP_vec(Op, DAG);
8128
8129  // Since UINT_TO_FP is marked Custom, the DAG combiner treats it as legal
8130  // and will not optimize it to a SINT_TO_FP when the sign bit is known
8131  // zero, so perform that optimization here.
8132  if (DAG.SignBitIsZero(N0))
8133    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
8134
8135  EVT SrcVT = N0.getValueType();
8136  EVT DstVT = Op.getValueType();
8137  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
8138    return LowerUINT_TO_FP_i64(Op, DAG);
8139  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
8140    return LowerUINT_TO_FP_i32(Op, DAG);
8141  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
8142    return SDValue();
8143
8144  // Make a 64-bit buffer, and use it to build an FILD.
8145  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
8146  if (SrcVT == MVT::i32) {
8147    SDValue WordOff = DAG.getConstant(4, getPointerTy());
8148    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
8149                                     getPointerTy(), StackSlot, WordOff);
8150    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8151                                  StackSlot, MachinePointerInfo(),
8152                                  false, false, 0);
8153    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
8154                                  OffsetSlot, MachinePointerInfo(),
8155                                  false, false, 0);
8156    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
8157    return Fild;
8158  }
8159
8160  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
8161  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8162                               StackSlot, MachinePointerInfo(),
8163                               false, false, 0);
8164  // For i64 source, we need to add the appropriate power of 2 if the input
8165  // was negative.  This is the same as the optimization in
8166  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
8167  // we must be careful to do the computation in x87 extended precision, not
8168  // in SSE. (The generic code can't know it's OK to do this, or how to.)
8169  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
8170  MachineMemOperand *MMO =
8171    DAG.getMachineFunction()
8172    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8173                          MachineMemOperand::MOLoad, 8, 8);
8174
8175  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
8176  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
8177  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
8178                                         MVT::i64, MMO);
8179
8180  APInt FF(32, 0x5F800000ULL);
8181
8182  // Check whether the sign bit is set.
8183  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
8184                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
8185                                 ISD::SETLT);
8186
8187  // Build a 64-bit pair (0, FF) in the constant pool, with FF in the lo bits.
8188  SDValue FudgePtr = DAG.getConstantPool(
8189                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
8190                                         getPointerTy());
8191
8192  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
8193  SDValue Zero = DAG.getIntPtrConstant(0);
8194  SDValue Four = DAG.getIntPtrConstant(4);
8195  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
8196                               Zero, Four);
8197  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
8198
8199  // Load the value out, extending it from f32 to f80.
8200  // FIXME: Avoid the extend by constructing the right constant pool?
8201  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
8202                                 FudgePtr, MachinePointerInfo::getConstantPool(),
8203                                 MVT::f32, false, false, 4);
8204  // Extend everything to 80 bits to force it to be done on x87.
8205  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
8206  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
8207}
8208
8209std::pair<SDValue,SDValue> X86TargetLowering::
8210FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const {
8211  DebugLoc DL = Op.getDebugLoc();
8212
8213  EVT DstTy = Op.getValueType();
8214
8215  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
8216    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
8217    DstTy = MVT::i64;
8218  }
8219
8220  assert(DstTy.getSimpleVT() <= MVT::i64 &&
8221         DstTy.getSimpleVT() >= MVT::i16 &&
8222         "Unknown FP_TO_INT to lower!");
8223
8224  // These are really Legal.
8225  if (DstTy == MVT::i32 &&
8226      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8227    return std::make_pair(SDValue(), SDValue());
8228  if (Subtarget->is64Bit() &&
8229      DstTy == MVT::i64 &&
8230      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8231    return std::make_pair(SDValue(), SDValue());
8232
8233  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
8234  // stack slot, or into the FTOL runtime function.
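  // In the FTOL case the runtime routine returns the converted 64-bit value in
  // EDX:EAX, which is why the WIN_FTOL path below copies the two halves out of
  // EAX and EDX and pairs them back together.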
8235  MachineFunction &MF = DAG.getMachineFunction();
8236  unsigned MemSize = DstTy.getSizeInBits()/8;
8237  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8238  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8239
8240  unsigned Opc;
8241  if (!IsSigned && isIntegerTypeFTOL(DstTy))
8242    Opc = X86ISD::WIN_FTOL;
8243  else
8244    switch (DstTy.getSimpleVT().SimpleTy) {
8245    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
8246    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
8247    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
8248    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
8249    }
8250
8251  SDValue Chain = DAG.getEntryNode();
8252  SDValue Value = Op.getOperand(0);
8253  EVT TheVT = Op.getOperand(0).getValueType();
8254  // FIXME: This causes a redundant load/store if the SSE-class value is
8255  // already in memory, such as if it is on the call stack.
8256  if (isScalarFPTypeInSSEReg(TheVT)) {
8257    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
8258    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
8259                         MachinePointerInfo::getFixedStack(SSFI),
8260                         false, false, 0);
8261    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
8262    SDValue Ops[] = {
8263      Chain, StackSlot, DAG.getValueType(TheVT)
8264    };
8265
8266    MachineMemOperand *MMO =
8267      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8268                              MachineMemOperand::MOLoad, MemSize, MemSize);
8269    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3,
8270                                    DstTy, MMO);
8271    Chain = Value.getValue(1);
8272    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8273    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8274  }
8275
8276  MachineMemOperand *MMO =
8277    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8278                            MachineMemOperand::MOStore, MemSize, MemSize);
8279
8280  if (Opc != X86ISD::WIN_FTOL) {
8281    // Build the FP_TO_INT*_IN_MEM
8282    SDValue Ops[] = { Chain, Value, StackSlot };
8283    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
8284                                           Ops, 3, DstTy, MMO);
8285    return std::make_pair(FIST, StackSlot);
8286  } else {
8287    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
8288      DAG.getVTList(MVT::Other, MVT::Glue),
8289      Chain, Value);
8290    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
8291      MVT::i32, ftol.getValue(1));
8292    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
8293      MVT::i32, eax.getValue(2));
8294    SDValue Ops[] = { eax, edx };
8295    SDValue pair = IsReplace
8296      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2)
8297      : DAG.getMergeValues(Ops, 2, DL);
8298    return std::make_pair(pair, SDValue());
8299  }
8300}
8301
8302SDValue X86TargetLowering::lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const {
8303  DebugLoc DL = Op.getDebugLoc();
8304  EVT VT = Op.getValueType();
8305  SDValue In = Op.getOperand(0);
8306  EVT SVT = In.getValueType();
8307
8308  if (!VT.is256BitVector() || !SVT.is128BitVector() ||
8309      VT.getVectorNumElements() != SVT.getVectorNumElements())
8310    return SDValue();
8311
8312  assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
8313
8314  // AVX2 has better support for integer extension.
8315  if (Subtarget->hasInt256())
8316    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
8317
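  // Without AVX2 there is no 256-bit integer zero-extend, so do it in halves:
  // zero-extend the low elements, shuffle the high elements down and
  // zero-extend those too, then concatenate.  The hard-coded types below
  // assume a v8i16 -> v8i32 extension.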
8318  SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
8319  static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
8320  SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
8321                           DAG.getVectorShuffle(MVT::v8i16, DL, In, DAG.getUNDEF(MVT::v8i16), &Mask[0]));
8322
8323  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
8324}
8325
8326SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
8327  DebugLoc DL = Op.getDebugLoc();
8328  EVT VT = Op.getValueType();
8329  EVT SVT = Op.getOperand(0).getValueType();
8330
8331  if (!VT.is128BitVector() || !SVT.is256BitVector() ||
8332      VT.getVectorNumElements() != SVT.getVectorNumElements())
8333    return SDValue();
8334
8335  assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
8336
8337  unsigned NumElems = VT.getVectorNumElements();
8338  EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
8339                             NumElems * 2);
8340
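  // Strategy: reinterpret the wide source as a vector with twice as many
  // narrow elements, pick every even-indexed element (the low half of each
  // wide element on little-endian x86), and take the low 128-bit subvector.
  // E.g. v4i64 -> v4i32 becomes a v8i32 shuffle with mask <0,2,4,6,u,u,u,u>
  // followed by an EXTRACT_SUBVECTOR at index 0.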
8341  SDValue In = Op.getOperand(0);
8342  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
8343  // Prepare truncation shuffle mask
8344  for (unsigned i = 0; i != NumElems; ++i)
8345    MaskVec[i] = i * 2;
8346  SDValue V = DAG.getVectorShuffle(NVT, DL,
8347                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
8348                                   DAG.getUNDEF(NVT), &MaskVec[0]);
8349  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
8350                     DAG.getIntPtrConstant(0));
8351}
8352
8353SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
8354                                           SelectionDAG &DAG) const {
8355  if (Op.getValueType().isVector()) {
8356    if (Op.getValueType() == MVT::v8i16)
8357      return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(),
8358                         DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
8359                                     MVT::v8i32, Op.getOperand(0)));
8360    return SDValue();
8361  }
8362
8363  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
8364    /*IsSigned=*/ true, /*IsReplace=*/ false);
8365  SDValue FIST = Vals.first, StackSlot = Vals.second;
8366  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
8367  if (FIST.getNode() == 0) return Op;
8368
8369  if (StackSlot.getNode())
8370    // Load the result.
8371    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
8372                       FIST, StackSlot, MachinePointerInfo(),
8373                       false, false, false, 0);
8374
8375  // The node is the result.
8376  return FIST;
8377}
8378
8379SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
8380                                           SelectionDAG &DAG) const {
8381  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
8382    /*IsSigned=*/ false, /*IsReplace=*/ false);
8383  SDValue FIST = Vals.first, StackSlot = Vals.second;
8384  assert(FIST.getNode() && "Unexpected failure");
8385
8386  if (StackSlot.getNode())
8387    // Load the result.
8388    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
8389                       FIST, StackSlot, MachinePointerInfo(),
8390                       false, false, false, 0);
8391
8392  // The node is the result.
8393  return FIST;
8394}
8395
8396SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op,
8397                                          SelectionDAG &DAG) const {
8398  DebugLoc DL = Op.getDebugLoc();
8399  EVT VT = Op.getValueType();
8400  SDValue In = Op.getOperand(0);
8401  EVT SVT = In.getValueType();
8402
8403  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
8404
8405  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
8406                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
8407                                 In, DAG.getUNDEF(SVT)));
8408}
8409
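// LowerFABS clears the sign bit by ANDing the value with a splat of
// ~(1 << (EltBits - 1)) loaded from the constant pool; LowerFNEG below flips
// the sign bit with the corresponding XOR mask.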
8410SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
8411  LLVMContext *Context = DAG.getContext();
8412  DebugLoc dl = Op.getDebugLoc();
8413  EVT VT = Op.getValueType();
8414  EVT EltVT = VT;
8415  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
8416  if (VT.isVector()) {
8417    EltVT = VT.getVectorElementType();
8418    NumElts = VT.getVectorNumElements();
8419  }
8420  Constant *C;
8421  if (EltVT == MVT::f64)
8422    C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
8423  else
8424    C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
8425  C = ConstantVector::getSplat(NumElts, C);
8426  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
8427  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
8428  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8429                             MachinePointerInfo::getConstantPool(),
8430                             false, false, false, Alignment);
8431  if (VT.isVector()) {
8432    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
8433    return DAG.getNode(ISD::BITCAST, dl, VT,
8434                       DAG.getNode(ISD::AND, dl, ANDVT,
8435                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
8436                                               Op.getOperand(0)),
8437                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
8438  }
8439  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
8440}
8441
8442SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
8443  LLVMContext *Context = DAG.getContext();
8444  DebugLoc dl = Op.getDebugLoc();
8445  EVT VT = Op.getValueType();
8446  EVT EltVT = VT;
8447  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
8448  if (VT.isVector()) {
8449    EltVT = VT.getVectorElementType();
8450    NumElts = VT.getVectorNumElements();
8451  }
8452  Constant *C;
8453  if (EltVT == MVT::f64)
8454    C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
8455  else
8456    C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
8457  C = ConstantVector::getSplat(NumElts, C);
8458  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
8459  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
8460  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8461                             MachinePointerInfo::getConstantPool(),
8462                             false, false, false, Alignment);
8463  if (VT.isVector()) {
8464    MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
8465    return DAG.getNode(ISD::BITCAST, dl, VT,
8466                       DAG.getNode(ISD::XOR, dl, XORVT,
8467                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
8468                                               Op.getOperand(0)),
8469                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
8470  }
8471
8472  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
8473}
8474
8475SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8476  LLVMContext *Context = DAG.getContext();
8477  SDValue Op0 = Op.getOperand(0);
8478  SDValue Op1 = Op.getOperand(1);
8479  DebugLoc dl = Op.getDebugLoc();
8480  EVT VT = Op.getValueType();
8481  EVT SrcVT = Op1.getValueType();
8482
8483  // If second operand is smaller, extend it first.
8484  if (SrcVT.bitsLT(VT)) {
8485    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
8486    SrcVT = VT;
8487  }
8488  // And if it is bigger, shrink it first.
8489  if (SrcVT.bitsGT(VT)) {
8490    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
8491    SrcVT = VT;
8492  }
8493
8494  // At this point the operands and the result should have the same
8495  // type, and that won't be f80 since that is not custom lowered.
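  // The result is assembled as (Op0 & ~sign-mask) | (Op1 & sign-mask) using
  // FAND/FOR nodes with masks loaded from the constant pool.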
8496
8497  // First get the sign bit of second operand.
8498  SmallVector<Constant*,4> CV;
8499  if (SrcVT == MVT::f64) {
8500    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
8501    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
8502  } else {
8503    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
8504    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8505    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8506    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8507  }
8508  Constant *C = ConstantVector::get(CV);
8509  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8510  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
8511                              MachinePointerInfo::getConstantPool(),
8512                              false, false, false, 16);
8513  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
8514
8515  // Shift sign bit right or left if the two operands have different types.
8516  if (SrcVT.bitsGT(VT)) {
8517    // Op0 is MVT::f32, Op1 is MVT::f64.
8518    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
8519    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
8520                          DAG.getConstant(32, MVT::i32));
8521    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
8522    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
8523                          DAG.getIntPtrConstant(0));
8524  }
8525
8526  // Clear first operand sign bit.
8527  CV.clear();
8528  if (VT == MVT::f64) {
8529    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
8530    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
8531  } else {
8532    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
8533    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8534    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8535    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8536  }
8537  C = ConstantVector::get(CV);
8538  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8539  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8540                              MachinePointerInfo::getConstantPool(),
8541                              false, false, false, 16);
8542  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
8543
8544  // Or the value with the sign bit.
8545  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
8546}
8547
8548static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
8549  SDValue N0 = Op.getOperand(0);
8550  DebugLoc dl = Op.getDebugLoc();
8551  EVT VT = Op.getValueType();
8552
8553  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
8554  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
8555                                  DAG.getConstant(1, VT));
8556  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
8557}
8558
8559// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
8560//
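// The pattern recognized is an OR tree whose leaves are EXTRACT_VECTOR_ELTs
// with constant indices.  If every lane of each source vector appears in the
// tree, the test against zero can be done with a single PTEST of the (possibly
// OR-combined) source vectors: ZF is set iff all of the bits are zero.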
8561SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const {
8562  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
8563
8564  if (!Subtarget->hasSSE41())
8565    return SDValue();
8566
8567  if (!Op->hasOneUse())
8568    return SDValue();
8569
8570  SDNode *N = Op.getNode();
8571  DebugLoc DL = N->getDebugLoc();
8572
8573  SmallVector<SDValue, 8> Opnds;
8574  DenseMap<SDValue, unsigned> VecInMap;
8575  EVT VT = MVT::Other;
8576
8577  // Recognize the special case where a vector is cast into a wide integer
8578  // to test whether all of its bits are zero.
8579  Opnds.push_back(N->getOperand(0));
8580  Opnds.push_back(N->getOperand(1));
8581
8582  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
8583    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
8584    // BFS traverse all OR'd operands.
8585    if (I->getOpcode() == ISD::OR) {
8586      Opnds.push_back(I->getOperand(0));
8587      Opnds.push_back(I->getOperand(1));
8588      // Re-evaluate the number of nodes to be traversed.
8589      e += 2; // 2 more nodes (LHS and RHS) are pushed.
8590      continue;
8591    }
8592
8593    // Quit if this is not an EXTRACT_VECTOR_ELT.
8594    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8595      return SDValue();
8596
8597    // Quit if the index is not a constant.
8598    SDValue Idx = I->getOperand(1);
8599    if (!isa<ConstantSDNode>(Idx))
8600      return SDValue();
8601
8602    SDValue ExtractedFromVec = I->getOperand(0);
8603    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
8604    if (M == VecInMap.end()) {
8605      VT = ExtractedFromVec.getValueType();
8606      // Quit if not 128/256-bit vector.
8607      if (!VT.is128BitVector() && !VT.is256BitVector())
8608        return SDValue();
8609      // Quit if not the same type.
8610      if (VecInMap.begin() != VecInMap.end() &&
8611          VT != VecInMap.begin()->first.getValueType())
8612        return SDValue();
8613      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
8614    }
8615    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
8616  }
8617
8618  assert((VT.is128BitVector() || VT.is256BitVector()) &&
8619         "Not extracted from 128-/256-bit vector.");
8620
8621  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
8622  SmallVector<SDValue, 8> VecIns;
8623
8624  for (DenseMap<SDValue, unsigned>::const_iterator
8625        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
8626    // Quit if not all elements are used.
8627    if (I->second != FullMask)
8628      return SDValue();
8629    VecIns.push_back(I->first);
8630  }
8631
8632  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
8633
8634  // Cast all vectors into TestVT for PTEST.
8635  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
8636    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
8637
8638  // If more than one full vector is evaluated, OR them before the PTEST.
8639  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
8640    // Each iteration will OR 2 nodes and append the result until there is only
8641    // 1 node left, i.e. the final OR'd value of all vectors.
8642    SDValue LHS = VecIns[Slot];
8643    SDValue RHS = VecIns[Slot + 1];
8644    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
8645  }
8646
8647  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
8648                     VecIns.back(), VecIns.back());
8649}
8650
8651/// Emit nodes that will be selected as "test Op0,Op0", or something
8652/// equivalent.
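/// The returned value produces EFLAGS: either the flag result of an existing
/// EFLAGS-setting node (reusing or rebuilding the arithmetic as an X86ISD
/// opcode) or, when that is not safe, an explicit (X86cmp Op, 0).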
8653SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
8654                                    SelectionDAG &DAG) const {
8655  DebugLoc dl = Op.getDebugLoc();
8656
8657  // CF and OF aren't always set the way we want. Determine which
8658  // of these we need.
8659  bool NeedCF = false;
8660  bool NeedOF = false;
8661  switch (X86CC) {
8662  default: break;
8663  case X86::COND_A: case X86::COND_AE:
8664  case X86::COND_B: case X86::COND_BE:
8665    NeedCF = true;
8666    break;
8667  case X86::COND_G: case X86::COND_GE:
8668  case X86::COND_L: case X86::COND_LE:
8669  case X86::COND_O: case X86::COND_NO:
8670    NeedOF = true;
8671    break;
8672  }
8673
8674  // See if we can use the EFLAGS value from the operand instead of
8675  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
8676  // we prove that the arithmetic won't overflow, we can't use OF or CF.
8677  if (Op.getResNo() != 0 || NeedOF || NeedCF)
8678    // Emit a CMP with 0, which is the TEST pattern.
8679    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8680                       DAG.getConstant(0, Op.getValueType()));
8681
8682  unsigned Opcode = 0;
8683  unsigned NumOperands = 0;
8684
8685  // Truncate operations may prevent the merge of the SETCC instruction
8686  // and the arithmetic instruction before it. Attempt to truncate the operands
8687  // of the arithmetic instruction and use a reduced bit-width instruction.
8688  bool NeedTruncation = false;
8689  SDValue ArithOp = Op;
8690  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
8691    SDValue Arith = Op->getOperand(0);
8692    // Both the trunc and the arithmetic op need to have one user each.
8693    if (Arith->hasOneUse())
8694      switch (Arith.getOpcode()) {
8695        default: break;
8696        case ISD::ADD:
8697        case ISD::SUB:
8698        case ISD::AND:
8699        case ISD::OR:
8700        case ISD::XOR: {
8701          NeedTruncation = true;
8702          ArithOp = Arith;
8703        }
8704      }
8705  }
8706
8707  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
8708  // which may sit behind a truncate.  We use the variable 'Op', which is the
8709  // non-truncated value, when we check for possible users.
8710  switch (ArithOp.getOpcode()) {
8711  case ISD::ADD:
8712    // Due to an isel shortcoming, be conservative if this add is likely to be
8713    // selected as part of a load-modify-store instruction. When the root node
8714    // in a match is a store, isel doesn't know how to remap non-chain non-flag
8715    // uses of other nodes in the match, such as the ADD in this case. This
8716    // leads to the ADD being left around and reselected, with the result being
8717    // two adds in the output.  Alas, even if none of our users are stores, that
8718    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
8719    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
8720    // climbing the DAG back to the root, and it doesn't seem to be worth the
8721    // effort.
8722    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8723         UE = Op.getNode()->use_end(); UI != UE; ++UI)
8724      if (UI->getOpcode() != ISD::CopyToReg &&
8725          UI->getOpcode() != ISD::SETCC &&
8726          UI->getOpcode() != ISD::STORE)
8727        goto default_case;
8728
8729    if (ConstantSDNode *C =
8730        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
8731      // An add of one will be selected as an INC.
8732      if (C->getAPIntValue() == 1) {
8733        Opcode = X86ISD::INC;
8734        NumOperands = 1;
8735        break;
8736      }
8737
8738      // An add of negative one (subtract of one) will be selected as a DEC.
8739      if (C->getAPIntValue().isAllOnesValue()) {
8740        Opcode = X86ISD::DEC;
8741        NumOperands = 1;
8742        break;
8743      }
8744    }
8745
8746    // Otherwise use a regular EFLAGS-setting add.
8747    Opcode = X86ISD::ADD;
8748    NumOperands = 2;
8749    break;
8750  case ISD::AND: {
8751    // If the primary result of the AND isn't used, don't bother using
8752    // X86ISD::AND, because a TEST instruction will be better.
8753    bool NonFlagUse = false;
8754    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8755           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
8756      SDNode *User = *UI;
8757      unsigned UOpNo = UI.getOperandNo();
8758      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
8759        // Look past the truncate.
8760        UOpNo = User->use_begin().getOperandNo();
8761        User = *User->use_begin();
8762      }
8763
8764      if (User->getOpcode() != ISD::BRCOND &&
8765          User->getOpcode() != ISD::SETCC &&
8766          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
8767        NonFlagUse = true;
8768        break;
8769      }
8770    }
8771
8772    if (!NonFlagUse)
8773      break;
8774  }
8775    // FALL THROUGH
8776  case ISD::SUB:
8777  case ISD::OR:
8778  case ISD::XOR:
8779    // Due to the ISEL shortcoming noted above, be conservative if this op is
8780    // likely to be selected as part of a load-modify-store instruction.
8781    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8782           UE = Op.getNode()->use_end(); UI != UE; ++UI)
8783      if (UI->getOpcode() == ISD::STORE)
8784        goto default_case;
8785
8786    // Otherwise use a regular EFLAGS-setting instruction.
8787    switch (ArithOp.getOpcode()) {
8788    default: llvm_unreachable("unexpected operator!");
8789    case ISD::SUB: Opcode = X86ISD::SUB; break;
8790    case ISD::XOR: Opcode = X86ISD::XOR; break;
8791    case ISD::AND: Opcode = X86ISD::AND; break;
8792    case ISD::OR: {
8793      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
8794        SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
8795        if (EFLAGS.getNode())
8796          return EFLAGS;
8797      }
8798      Opcode = X86ISD::OR;
8799      break;
8800    }
8801    }
8802
8803    NumOperands = 2;
8804    break;
8805  case X86ISD::ADD:
8806  case X86ISD::SUB:
8807  case X86ISD::INC:
8808  case X86ISD::DEC:
8809  case X86ISD::OR:
8810  case X86ISD::XOR:
8811  case X86ISD::AND:
8812    return SDValue(Op.getNode(), 1);
8813  default:
8814  default_case:
8815    break;
8816  }
8817
8818  // If we found that truncation is beneficial, perform the truncation and
8819  // update 'Op'.
8820  if (NeedTruncation) {
8821    EVT VT = Op.getValueType();
8822    SDValue WideVal = Op->getOperand(0);
8823    EVT WideVT = WideVal.getValueType();
8824    unsigned ConvertedOp = 0;
8825    // Use a target machine opcode to prevent further DAGCombine
8826    // optimizations that may separate the arithmetic operations
8827    // from the setcc node.
8828    switch (WideVal.getOpcode()) {
8829      default: break;
8830      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
8831      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
8832      case ISD::AND: ConvertedOp = X86ISD::AND; break;
8833      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
8834      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
8835    }
8836
8837    if (ConvertedOp) {
8838      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8839      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
8840        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
8841        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
8842        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
8843      }
8844    }
8845  }
8846
8847  if (Opcode == 0)
8848    // Emit a CMP with 0, which is the TEST pattern.
8849    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8850                       DAG.getConstant(0, Op.getValueType()));
8851
8852  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
8853  SmallVector<SDValue, 4> Ops;
8854  for (unsigned i = 0; i != NumOperands; ++i)
8855    Ops.push_back(Op.getOperand(i));
8856
8857  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
8858  DAG.ReplaceAllUsesWith(Op, New);
8859  return SDValue(New.getNode(), 1);
8860}
8861
8862/// Emit nodes that will be selected as "cmp Op0,Op1", or something
8863/// equivalent.
8864SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
8865                                   SelectionDAG &DAG) const {
8866  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
8867    if (C->getAPIntValue() == 0)
8868      return EmitTest(Op0, X86CC, DAG);
8869
8870  DebugLoc dl = Op0.getDebugLoc();
8871  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
8872       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
8873    // Use SUB instead of CMP to enable CSE between SUB and CMP.
8874    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
8875    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
8876                              Op0, Op1);
8877    return SDValue(Sub.getNode(), 1);
8878  }
8879  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
8880}
8881
8882/// Convert a comparison if required by the subtarget.
8883SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
8884                                                 SelectionDAG &DAG) const {
8885  // If the subtarget does not support the FUCOMI instruction, floating-point
8886  // comparisons have to be converted.
8887  if (Subtarget->hasCMov() ||
8888      Cmp.getOpcode() != X86ISD::CMP ||
8889      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
8890      !Cmp.getOperand(1).getValueType().isFloatingPoint())
8891    return Cmp;
8892
8893  // The instruction selector will select an FUCOM instruction instead of
8894  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
8895  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
8896  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
8897  DebugLoc dl = Cmp.getDebugLoc();
8898  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
8899  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
8900  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
8901                            DAG.getConstant(8, MVT::i8));
8902  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
8903  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
8904}
8905
8906static bool isAllOnes(SDValue V) {
8907  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
8908  return C && C->isAllOnesValue();
8909}
8910
8911/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
8912/// if it's possible.
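/// For example, (and X, (shl 1, N)) seteq 0 becomes (X86setcc COND_AE,
/// (X86bt X, N)) and the setne form uses COND_B, since BT copies the selected
/// bit into CF.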
8913SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
8914                                     DebugLoc dl, SelectionDAG &DAG) const {
8915  SDValue Op0 = And.getOperand(0);
8916  SDValue Op1 = And.getOperand(1);
8917  if (Op0.getOpcode() == ISD::TRUNCATE)
8918    Op0 = Op0.getOperand(0);
8919  if (Op1.getOpcode() == ISD::TRUNCATE)
8920    Op1 = Op1.getOperand(0);
8921
8922  SDValue LHS, RHS;
8923  if (Op1.getOpcode() == ISD::SHL)
8924    std::swap(Op0, Op1);
8925  if (Op0.getOpcode() == ISD::SHL) {
8926    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
8927      if (And00C->getZExtValue() == 1) {
8928        // If we looked past a truncate, check that it's only truncating away
8929        // known zeros.
8930        unsigned BitWidth = Op0.getValueSizeInBits();
8931        unsigned AndBitWidth = And.getValueSizeInBits();
8932        if (BitWidth > AndBitWidth) {
8933          APInt Zeros, Ones;
8934          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
8935          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
8936            return SDValue();
8937        }
8938        LHS = Op1;
8939        RHS = Op0.getOperand(1);
8940      }
8941  } else if (Op1.getOpcode() == ISD::Constant) {
8942    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
8943    uint64_t AndRHSVal = AndRHS->getZExtValue();
8944    SDValue AndLHS = Op0;
8945
8946    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
8947      LHS = AndLHS.getOperand(0);
8948      RHS = AndLHS.getOperand(1);
8949    }
8950
8951    // Use BT if the immediate can't be encoded in a TEST instruction.
8952    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
8953      LHS = AndLHS;
8954      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
8955    }
8956  }
8957
8958  if (LHS.getNode()) {
8959    // If the LHS is of the form (x ^ -1) then replace the LHS with x and flip
8960    // the condition code later.
8961    bool Invert = false;
8962    if (LHS.getOpcode() == ISD::XOR && isAllOnes(LHS.getOperand(1))) {
8963      Invert = true;
8964      LHS = LHS.getOperand(0);
8965    }
8966
8967    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
8968    // instruction.  Since the shift amount is in-range-or-undefined, we know
8969    // that doing a bittest on the i32 value is ok.  We extend to i32 because
8970    // the encoding for the i16 version is larger than the i32 version.
8971    // Also promote i16 to i32 for performance / code size reasons.
8972    if (LHS.getValueType() == MVT::i8 ||
8973        LHS.getValueType() == MVT::i16)
8974      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
8975
8976    // If the operand types disagree, extend the shift amount to match.  Since
8977    // BT ignores high bits (like shifts) we can use anyextend.
8978    if (LHS.getValueType() != RHS.getValueType())
8979      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
8980
8981    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
8982    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
8983    // Flip the condition if the LHS was a not instruction
8984    if (Invert)
8985      Cond = X86::GetOppositeBranchCondition(Cond);
8986    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
8987                       DAG.getConstant(Cond, MVT::i8), BT);
8988  }
8989
8990  return SDValue();
8991}
8992
8993SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
8994
8995  if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG);
8996
8997  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
8998  SDValue Op0 = Op.getOperand(0);
8999  SDValue Op1 = Op.getOperand(1);
9000  DebugLoc dl = Op.getDebugLoc();
9001  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
9002
9003  // Optimize to BT if possible.
9004  // Lower (X & (1 << N)) == 0 to BT(X, N).
9005  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
9006  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
9007  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
9008      Op1.getOpcode() == ISD::Constant &&
9009      cast<ConstantSDNode>(Op1)->isNullValue() &&
9010      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9011    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
9012    if (NewSetCC.getNode())
9013      return NewSetCC;
9014  }
9015
9016  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
9017  // these.
9018  if (Op1.getOpcode() == ISD::Constant &&
9019      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
9020       cast<ConstantSDNode>(Op1)->isNullValue()) &&
9021      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9022
9023    // If the input is a setcc, then reuse the input setcc or use a new one with
9024    // the inverted condition.
9025    if (Op0.getOpcode() == X86ISD::SETCC) {
9026      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
9027      bool Invert = (CC == ISD::SETNE) ^
9028        cast<ConstantSDNode>(Op1)->isNullValue();
9029      if (!Invert) return Op0;
9030
9031      CCode = X86::GetOppositeBranchCondition(CCode);
9032      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9033                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
9034    }
9035  }
9036
9037  bool isFP = Op1.getValueType().isFloatingPoint();
9038  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
9039  if (X86CC == X86::COND_INVALID)
9040    return SDValue();
9041
9042  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
9043  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
9044  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9045                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
9046}
9047
9048// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
9049// ones, and then concatenate the result back.
9050static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
9051  EVT VT = Op.getValueType();
9052
9053  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
9054         "Unsupported value type for operation");
9055
9056  unsigned NumElems = VT.getVectorNumElements();
9057  DebugLoc dl = Op.getDebugLoc();
9058  SDValue CC = Op.getOperand(2);
9059
9060  // Extract the LHS vectors
9061  SDValue LHS = Op.getOperand(0);
9062  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
9063  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
9064
9065  // Extract the RHS vectors
9066  SDValue RHS = Op.getOperand(1);
9067  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
9068  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
9069
9070  // Issue the operation on the smaller types and concatenate the result back
9071  MVT EltVT = VT.getVectorElementType().getSimpleVT();
9072  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
9073  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
9074                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
9075                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
9076}
9077
9078
9079SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
9080  SDValue Cond;
9081  SDValue Op0 = Op.getOperand(0);
9082  SDValue Op1 = Op.getOperand(1);
9083  SDValue CC = Op.getOperand(2);
9084  EVT VT = Op.getValueType();
9085  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
9086  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
9087  DebugLoc dl = Op.getDebugLoc();
9088
9089  if (isFP) {
9090#ifndef NDEBUG
9091    EVT EltVT = Op0.getValueType().getVectorElementType();
9092    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
9093#endif
9094
9095    unsigned SSECC;
9096    bool Swap = false;
9097
9098    // SSE Condition code mapping:
9099    //  0 - EQ
9100    //  1 - LT
9101    //  2 - LE
9102    //  3 - UNORD
9103    //  4 - NEQ
9104    //  5 - NLT
9105    //  6 - NLE
9106    //  7 - ORD
9107    switch (SetCCOpcode) {
9108    default: llvm_unreachable("Unexpected SETCC condition");
9109    case ISD::SETOEQ:
9110    case ISD::SETEQ:  SSECC = 0; break;
9111    case ISD::SETOGT:
9112    case ISD::SETGT: Swap = true; // Fallthrough
9113    case ISD::SETLT:
9114    case ISD::SETOLT: SSECC = 1; break;
9115    case ISD::SETOGE:
9116    case ISD::SETGE: Swap = true; // Fallthrough
9117    case ISD::SETLE:
9118    case ISD::SETOLE: SSECC = 2; break;
9119    case ISD::SETUO:  SSECC = 3; break;
9120    case ISD::SETUNE:
9121    case ISD::SETNE:  SSECC = 4; break;
9122    case ISD::SETULE: Swap = true; // Fallthrough
9123    case ISD::SETUGE: SSECC = 5; break;
9124    case ISD::SETULT: Swap = true; // Fallthrough
9125    case ISD::SETUGT: SSECC = 6; break;
9126    case ISD::SETO:   SSECC = 7; break;
9127    case ISD::SETUEQ:
9128    case ISD::SETONE: SSECC = 8; break;
9129    }
9130    if (Swap)
9131      std::swap(Op0, Op1);
9132
9133    // In the two special cases we can't handle, emit two comparisons.
9134    if (SSECC == 8) {
9135      unsigned CC0, CC1;
9136      unsigned CombineOpc;
9137      if (SetCCOpcode == ISD::SETUEQ) {
9138        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
9139      } else {
9140        assert(SetCCOpcode == ISD::SETONE);
9141        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
9142      }
9143
9144      SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
9145                                 DAG.getConstant(CC0, MVT::i8));
9146      SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
9147                                 DAG.getConstant(CC1, MVT::i8));
9148      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
9149    }
9150    // Handle all other FP comparisons here.
9151    return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
9152                       DAG.getConstant(SSECC, MVT::i8));
9153  }
9154
9155  // Break 256-bit integer vector compare into smaller ones.
9156  if (VT.is256BitVector() && !Subtarget->hasInt256())
9157    return Lower256IntVSETCC(Op, DAG);
9158
9159  // We are handling one of the integer comparisons here.  Since SSE only has
9160  // GT and EQ comparisons for integers, swapping operands and multiple
9161  // operations may be required for some comparisons.
9162  unsigned Opc;
9163  bool Swap = false, Invert = false, FlipSigns = false;
9164
9165  switch (SetCCOpcode) {
9166  default: llvm_unreachable("Unexpected SETCC condition");
9167  case ISD::SETNE:  Invert = true;
9168  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
9169  case ISD::SETLT:  Swap = true;
9170  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
9171  case ISD::SETGE:  Swap = true;
9172  case ISD::SETLE:  Opc = X86ISD::PCMPGT; Invert = true; break;
9173  case ISD::SETULT: Swap = true;
9174  case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
9175  case ISD::SETUGE: Swap = true;
9176  case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
9177  }
9178  if (Swap)
9179    std::swap(Op0, Op1);
9180
9181  // Check that the operation in question is available (most are plain SSE2,
9182  // but PCMPGTQ and PCMPEQQ have different requirements).
9183  if (VT == MVT::v2i64) {
9184    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
9185      return SDValue();
9186    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41())
9187      return SDValue();
9188  }
9189
9190  // Since SSE has no unsigned integer comparisons, we need to flip the sign
9191  // bits of the inputs before performing those operations.
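  // XORing both operands with the sign bit turns an unsigned comparison into
  // the equivalent signed one, e.g. for 32-bit elements
  //   a <u b   <=>   (a ^ 0x80000000) <s (b ^ 0x80000000).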
9192  if (FlipSigns) {
9193    EVT EltVT = VT.getVectorElementType();
9194    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
9195                                      EltVT);
9196    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
9197    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
9198                                    SignBits.size());
9199    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
9200    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
9201  }
9202
9203  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
9204
9205  // If the logical-not of the result is required, perform that now.
9206  if (Invert)
9207    Result = DAG.getNOT(dl, Result, VT);
9208
9209  return Result;
9210}
9211
9212// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
9213static bool isX86LogicalCmp(SDValue Op) {
9214  unsigned Opc = Op.getNode()->getOpcode();
9215  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
9216      Opc == X86ISD::SAHF)
9217    return true;
9218  if (Op.getResNo() == 1 &&
9219      (Opc == X86ISD::ADD ||
9220       Opc == X86ISD::SUB ||
9221       Opc == X86ISD::ADC ||
9222       Opc == X86ISD::SBB ||
9223       Opc == X86ISD::SMUL ||
9224       Opc == X86ISD::UMUL ||
9225       Opc == X86ISD::INC ||
9226       Opc == X86ISD::DEC ||
9227       Opc == X86ISD::OR ||
9228       Opc == X86ISD::XOR ||
9229       Opc == X86ISD::AND))
9230    return true;
9231
9232  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
9233    return true;
9234
9235  return false;
9236}
9237
9238static bool isZero(SDValue V) {
9239  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
9240  return C && C->isNullValue();
9241}
9242
9243static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
9244  if (V.getOpcode() != ISD::TRUNCATE)
9245    return false;
9246
9247  SDValue VOp0 = V.getOperand(0);
9248  unsigned InBits = VOp0.getValueSizeInBits();
9249  unsigned Bits = V.getValueSizeInBits();
9250  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
9251}
9252
9253SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
9254  bool addTest = true;
9255  SDValue Cond  = Op.getOperand(0);
9256  SDValue Op1 = Op.getOperand(1);
9257  SDValue Op2 = Op.getOperand(2);
9258  DebugLoc DL = Op.getDebugLoc();
9259  SDValue CC;
9260
9261  if (Cond.getOpcode() == ISD::SETCC) {
9262    SDValue NewCond = LowerSETCC(Cond, DAG);
9263    if (NewCond.getNode())
9264      Cond = NewCond;
9265  }
9266
9267  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
9268  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
9269  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
9270  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
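  // The trick: comparing x against 1 borrows exactly when x == 0, so a
  // SETCC_CARRY (i.e. sbb reg, reg) materializes all-ones or zero from CF
  // without a branch; an optional NOT and an OR with y finish the job.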
9271  if (Cond.getOpcode() == X86ISD::SETCC &&
9272      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
9273      isZero(Cond.getOperand(1).getOperand(1))) {
9274    SDValue Cmp = Cond.getOperand(1);
9275
9276    unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
9277
9278    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
9279        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
9280      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
9281
9282      SDValue CmpOp0 = Cmp.getOperand(0);
9283      // Apply further optimizations for special cases
9284      // (select (x != 0), -1, 0) -> neg & sbb
9285      // (select (x == 0), 0, -1) -> neg & sbb
9286      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
9287        if (YC->isNullValue() &&
9288            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
9289          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
9290          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
9291                                    DAG.getConstant(0, CmpOp0.getValueType()),
9292                                    CmpOp0);
9293          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
9294                                    DAG.getConstant(X86::COND_B, MVT::i8),
9295                                    SDValue(Neg.getNode(), 1));
9296          return Res;
9297        }
9298
9299      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
9300                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
9301      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9302
9303      SDValue Res =   // Res = 0 or -1.
9304        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
9305                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
9306
9307      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
9308        Res = DAG.getNOT(DL, Res, Res.getValueType());
9309
9310      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
9311      if (N2C == 0 || !N2C->isNullValue())
9312        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
9313      return Res;
9314    }
9315  }
9316
9317  // Look past (and (setcc_carry (cmp ...)), 1).
9318  if (Cond.getOpcode() == ISD::AND &&
9319      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
9320    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
9321    if (C && C->getAPIntValue() == 1)
9322      Cond = Cond.getOperand(0);
9323  }
9324
9325  // If condition flag is set by a X86ISD::CMP, then use it as the condition
9326  // setting operand in place of the X86ISD::SETCC.
9327  unsigned CondOpcode = Cond.getOpcode();
9328  if (CondOpcode == X86ISD::SETCC ||
9329      CondOpcode == X86ISD::SETCC_CARRY) {
9330    CC = Cond.getOperand(0);
9331
9332    SDValue Cmp = Cond.getOperand(1);
9333    unsigned Opc = Cmp.getOpcode();
9334    EVT VT = Op.getValueType();
9335
9336    bool IllegalFPCMov = false;
9337    if (VT.isFloatingPoint() && !VT.isVector() &&
9338        !isScalarFPTypeInSSEReg(VT))  // FPStack?
9339      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
9340
9341    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
9342        Opc == X86ISD::BT) { // FIXME
9343      Cond = Cmp;
9344      addTest = false;
9345    }
9346  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
9347             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
9348             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
9349              Cond.getOperand(0).getValueType() != MVT::i8)) {
9350    SDValue LHS = Cond.getOperand(0);
9351    SDValue RHS = Cond.getOperand(1);
9352    unsigned X86Opcode;
9353    unsigned X86Cond;
9354    SDVTList VTs;
9355    switch (CondOpcode) {
9356    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
9357    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
9358    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
9359    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
9360    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
9361    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
9362    default: llvm_unreachable("unexpected overflowing operator");
9363    }
9364    if (CondOpcode == ISD::UMULO)
9365      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
9366                          MVT::i32);
9367    else
9368      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
9369
9370    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
9371
9372    if (CondOpcode == ISD::UMULO)
9373      Cond = X86Op.getValue(2);
9374    else
9375      Cond = X86Op.getValue(1);
9376
9377    CC = DAG.getConstant(X86Cond, MVT::i8);
9378    addTest = false;
9379  }
9380
9381  if (addTest) {
9382    // Look past the truncate if the high bits are known zero.
9383    if (isTruncWithZeroHighBitsInput(Cond, DAG))
9384      Cond = Cond.getOperand(0);
9385
9386    // We know the result of AND is compared against zero. Try to match
9387    // it to BT.
9388    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
9389      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
9390      if (NewSetCC.getNode()) {
9391        CC = NewSetCC.getOperand(0);
9392        Cond = NewSetCC.getOperand(1);
9393        addTest = false;
9394      }
9395    }
9396  }
9397
9398  if (addTest) {
9399    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9400    Cond = EmitTest(Cond, X86::COND_NE, DAG);
9401  }
9402
9403  // a <  b ? -1 :  0 -> RES = ~setcc_carry
9404  // a <  b ?  0 : -1 -> RES = setcc_carry
9405  // a >= b ? -1 :  0 -> RES = setcc_carry
9406  // a >= b ?  0 : -1 -> RES = ~setcc_carry
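  // SETCC_CARRY with COND_B materializes the carry flag across the whole
  // register (conceptually an "sbb reg, reg"), so an unsigned compare that
  // already sets CF gives the 0/-1 mask directly, possibly followed by a NOT.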
9407  if (Cond.getOpcode() == X86ISD::SUB) {
9408    Cond = ConvertCmpIfNecessary(Cond, DAG);
9409    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
9410
9411    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
9412        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
9413      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
9414                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
9415      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
9416        return DAG.getNOT(DL, Res, Res.getValueType());
9417      return Res;
9418    }
9419  }
9420
9421  // X86 doesn't have an i8 cmov. If both operands are the result of a
9422  // truncate, widen the cmov and push the truncate through. This avoids
9423  // introducing a new branch during isel and doesn't add any extensions.
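  // For illustration, the rewrite performed here is roughly:
  //   (i8 (select cc, (trunc i32 %a), (trunc i32 %b)))
  //     -> (trunc (i32 cmov picking %a or %b on cc) to i8)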
9424  if (Op.getValueType() == MVT::i8 &&
9425      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
9426    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
9427    if (T1.getValueType() == T2.getValueType() &&
9428        // Blacklist CopyFromReg to avoid partial register stalls.
9429        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
9430      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
9431      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
9432      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
9433    }
9434  }
9435
9436  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
9437  // condition is true.
9438  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
9439  SDValue Ops[] = { Op2, Op1, CC, Cond };
9440  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
9441}
9442
9443// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
9444// X86ISD::SETCC nodes, each of which has no other use apart from the
9445// AND / OR.
9446static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
9447  Opc = Op.getOpcode();
9448  if (Opc != ISD::OR && Opc != ISD::AND)
9449    return false;
9450  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
9451          Op.getOperand(0).hasOneUse() &&
9452          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
9453          Op.getOperand(1).hasOneUse());
9454}
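// An illustrative match: the pattern produced for a compare such as FCMP_UNE
// is (or (X86ISD::SETCC cc0, flags), (X86ISD::SETCC cc1, flags)), where both
// setccs read the same flags and are used only by the OR; LowerBRCOND below
// can then emit two conditional branches instead of materializing the OR.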
9455
9456// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC
9457// and 1, and the SETCC node has a single use.
9458static bool isXor1OfSetCC(SDValue Op) {
9459  if (Op.getOpcode() != ISD::XOR)
9460    return false;
9461  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
9462  if (N1C && N1C->getAPIntValue() == 1) {
9463    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
9464      Op.getOperand(0).hasOneUse();
9465  }
9466  return false;
9467}
9468
9469SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
9470  bool addTest = true;
9471  SDValue Chain = Op.getOperand(0);
9472  SDValue Cond  = Op.getOperand(1);
9473  SDValue Dest  = Op.getOperand(2);
9474  DebugLoc dl = Op.getDebugLoc();
9475  SDValue CC;
9476  bool Inverted = false;
9477
9478  if (Cond.getOpcode() == ISD::SETCC) {
9479    // Check for setcc([su]{add,sub,mul}o == 0).
9480    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
9481        isa<ConstantSDNode>(Cond.getOperand(1)) &&
9482        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
9483        Cond.getOperand(0).getResNo() == 1 &&
9484        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
9485         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
9486         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
9487         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
9488         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
9489         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
9490      Inverted = true;
9491      Cond = Cond.getOperand(0);
9492    } else {
9493      SDValue NewCond = LowerSETCC(Cond, DAG);
9494      if (NewCond.getNode())
9495        Cond = NewCond;
9496    }
9497  }
9498#if 0
9499  // FIXME: LowerXALUO doesn't handle these!!
9500  else if (Cond.getOpcode() == X86ISD::ADD  ||
9501           Cond.getOpcode() == X86ISD::SUB  ||
9502           Cond.getOpcode() == X86ISD::SMUL ||
9503           Cond.getOpcode() == X86ISD::UMUL)
9504    Cond = LowerXALUO(Cond, DAG);
9505#endif
9506
9507  // Look past (and (setcc_carry (cmp ...)), 1).
9508  if (Cond.getOpcode() == ISD::AND &&
9509      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
9510    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
9511    if (C && C->getAPIntValue() == 1)
9512      Cond = Cond.getOperand(0);
9513  }
9514
9515  // If condition flag is set by a X86ISD::CMP, then use it as the condition
9516  // setting operand in place of the X86ISD::SETCC.
9517  unsigned CondOpcode = Cond.getOpcode();
9518  if (CondOpcode == X86ISD::SETCC ||
9519      CondOpcode == X86ISD::SETCC_CARRY) {
9520    CC = Cond.getOperand(0);
9521
9522    SDValue Cmp = Cond.getOperand(1);
9523    unsigned Opc = Cmp.getOpcode();
9524    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
9525    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
9526      Cond = Cmp;
9527      addTest = false;
9528    } else {
9529      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
9530      default: break;
9531      case X86::COND_O:
9532      case X86::COND_B:
9533        // These can only come from an arithmetic instruction with overflow,
9534        // e.g. SADDO, UADDO.
9535        Cond = Cond.getNode()->getOperand(1);
9536        addTest = false;
9537        break;
9538      }
9539    }
9540  }
9541  CondOpcode = Cond.getOpcode();
9542  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
9543      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
9544      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
9545       Cond.getOperand(0).getValueType() != MVT::i8)) {
9546    SDValue LHS = Cond.getOperand(0);
9547    SDValue RHS = Cond.getOperand(1);
9548    unsigned X86Opcode;
9549    unsigned X86Cond;
9550    SDVTList VTs;
9551    switch (CondOpcode) {
9552    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
9553    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
9554    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
9555    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
9556    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
9557    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
9558    default: llvm_unreachable("unexpected overflowing operator");
9559    }
9560    if (Inverted)
9561      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
9562    if (CondOpcode == ISD::UMULO)
9563      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
9564                          MVT::i32);
9565    else
9566      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
9567
9568    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
9569
9570    if (CondOpcode == ISD::UMULO)
9571      Cond = X86Op.getValue(2);
9572    else
9573      Cond = X86Op.getValue(1);
9574
9575    CC = DAG.getConstant(X86Cond, MVT::i8);
9576    addTest = false;
9577  } else {
9578    unsigned CondOpc;
9579    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
9580      SDValue Cmp = Cond.getOperand(0).getOperand(1);
9581      if (CondOpc == ISD::OR) {
9582        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
9583        // two branches instead of an explicit OR instruction with a
9584        // separate test.
9585        if (Cmp == Cond.getOperand(1).getOperand(1) &&
9586            isX86LogicalCmp(Cmp)) {
9587          CC = Cond.getOperand(0).getOperand(0);
9588          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9589                              Chain, Dest, CC, Cmp);
9590          CC = Cond.getOperand(1).getOperand(0);
9591          Cond = Cmp;
9592          addTest = false;
9593        }
9594      } else { // ISD::AND
9595        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
9596        // two branches instead of an explicit AND instruction with a
9597        // separate test. However, we only do this if this block doesn't
9598        // have a fall-through edge, because this requires an explicit
9599        // jmp when the condition is false.
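        // For illustration, with a compare whose result lives in ZF/PF (e.g.
        // FCMP_OEQ after a ucomiss), the expansion built here is roughly:
        //   jne  false_bb
        //   jp   false_bb
        //   jmp  true_bb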
9600        if (Cmp == Cond.getOperand(1).getOperand(1) &&
9601            isX86LogicalCmp(Cmp) &&
9602            Op.getNode()->hasOneUse()) {
9603          X86::CondCode CCode =
9604            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
9605          CCode = X86::GetOppositeBranchCondition(CCode);
9606          CC = DAG.getConstant(CCode, MVT::i8);
9607          SDNode *User = *Op.getNode()->use_begin();
9608          // Look for an unconditional branch following this conditional branch.
9609          // We need this because we need to reverse the successors in order
9610          // to implement FCMP_OEQ.
9611          if (User->getOpcode() == ISD::BR) {
9612            SDValue FalseBB = User->getOperand(1);
9613            SDNode *NewBR =
9614              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9615            assert(NewBR == User);
9616            (void)NewBR;
9617            Dest = FalseBB;
9618
9619            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9620                                Chain, Dest, CC, Cmp);
9621            X86::CondCode CCode =
9622              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
9623            CCode = X86::GetOppositeBranchCondition(CCode);
9624            CC = DAG.getConstant(CCode, MVT::i8);
9625            Cond = Cmp;
9626            addTest = false;
9627          }
9628        }
9629      }
9630    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
9631      // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
9632      // It should be transformed by the DAG combiner except when the condition
9633      // is set by an arithmetic-with-overflow node.
9634      X86::CondCode CCode =
9635        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
9636      CCode = X86::GetOppositeBranchCondition(CCode);
9637      CC = DAG.getConstant(CCode, MVT::i8);
9638      Cond = Cond.getOperand(0).getOperand(1);
9639      addTest = false;
9640    } else if (Cond.getOpcode() == ISD::SETCC &&
9641               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
9642      // For FCMP_OEQ, we can emit
9643      // two branches instead of an explicit AND instruction with a
9644      // separate test. However, we only do this if this block doesn't
9645      // have a fall-through edge, because this requires an explicit
9646      // jmp when the condition is false.
9647      if (Op.getNode()->hasOneUse()) {
9648        SDNode *User = *Op.getNode()->use_begin();
9649        // Look for an unconditional branch following this conditional branch.
9650        // We need this because we need to reverse the successors in order
9651        // to implement FCMP_OEQ.
9652        if (User->getOpcode() == ISD::BR) {
9653          SDValue FalseBB = User->getOperand(1);
9654          SDNode *NewBR =
9655            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9656          assert(NewBR == User);
9657          (void)NewBR;
9658          Dest = FalseBB;
9659
9660          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
9661                                    Cond.getOperand(0), Cond.getOperand(1));
9662          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9663          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9664          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9665                              Chain, Dest, CC, Cmp);
9666          CC = DAG.getConstant(X86::COND_P, MVT::i8);
9667          Cond = Cmp;
9668          addTest = false;
9669        }
9670      }
9671    } else if (Cond.getOpcode() == ISD::SETCC &&
9672               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
9673      // For FCMP_UNE, we can emit
9674      // two branches instead of an explicit AND instruction with a
9675      // separate test. However, we only do this if this block doesn't
9676      // have a fall-through edge, because this requires an explicit
9677      // jmp when the condition is false.
9678      if (Op.getNode()->hasOneUse()) {
9679        SDNode *User = *Op.getNode()->use_begin();
9680        // Look for an unconditional branch following this conditional branch.
9681        // We need this because we need to reverse the successors in order
9682        // to implement FCMP_UNE.
9683        if (User->getOpcode() == ISD::BR) {
9684          SDValue FalseBB = User->getOperand(1);
9685          SDNode *NewBR =
9686            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9687          assert(NewBR == User);
9688          (void)NewBR;
9689
9690          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
9691                                    Cond.getOperand(0), Cond.getOperand(1));
9692          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9693          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9694          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9695                              Chain, Dest, CC, Cmp);
9696          CC = DAG.getConstant(X86::COND_NP, MVT::i8);
9697          Cond = Cmp;
9698          addTest = false;
9699          Dest = FalseBB;
9700        }
9701      }
9702    }
9703  }
9704
9705  if (addTest) {
9706    // Look past the truncate if the high bits are known zero.
9707    if (isTruncWithZeroHighBitsInput(Cond, DAG))
9708      Cond = Cond.getOperand(0);
9709
9710    // We know the result of AND is compared against zero. Try to match
9711    // it to BT.
9712    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
9713      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
9714      if (NewSetCC.getNode()) {
9715        CC = NewSetCC.getOperand(0);
9716        Cond = NewSetCC.getOperand(1);
9717        addTest = false;
9718      }
9719    }
9720  }
9721
9722  if (addTest) {
9723    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9724    Cond = EmitTest(Cond, X86::COND_NE, DAG);
9725  }
9726  Cond = ConvertCmpIfNecessary(Cond, DAG);
9727  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9728                     Chain, Dest, CC, Cond);
9729}
9730
9731
9732// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
9733// Calls to _alloca are needed to probe the stack when allocating more than 4K
9734// bytes in one go. Touching the stack at 4K increments is necessary to ensure
9735// that the guard pages used by the OS virtual memory manager are allocated in
9736// correct sequence.
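// A rough sketch of the non-segmented path below: the requested size is
// copied into RAX/EAX, an X86ISD::WIN_ALLOCA node performs the probed
// allocation, and the adjusted stack pointer is then read back as the result.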
9737SDValue
9738X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
9739                                           SelectionDAG &DAG) const {
9740  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
9741          getTargetMachine().Options.EnableSegmentedStacks) &&
9742         "This should be used only on Windows targets or when segmented stacks "
9743         "are being used");
9744  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
9745  DebugLoc dl = Op.getDebugLoc();
9746
9747  // Get the inputs.
9748  SDValue Chain = Op.getOperand(0);
9749  SDValue Size  = Op.getOperand(1);
9750  // FIXME: Ensure alignment here
9751
9752  bool Is64Bit = Subtarget->is64Bit();
9753  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
9754
9755  if (getTargetMachine().Options.EnableSegmentedStacks) {
9756    MachineFunction &MF = DAG.getMachineFunction();
9757    MachineRegisterInfo &MRI = MF.getRegInfo();
9758
9759    if (Is64Bit) {
9760      // The 64-bit implementation of segmented stacks needs to clobber both
9761      // r10 and r11, which makes it impossible to use with nested parameters.
9762      const Function *F = MF.getFunction();
9763
9764      for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
9765           I != E; ++I)
9766        if (I->hasNestAttr())
9767          report_fatal_error("Cannot use segmented stacks with functions that "
9768                             "have nested arguments.");
9769    }
9770
9771    const TargetRegisterClass *AddrRegClass =
9772      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
9773    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
9774    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
9775    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
9776                                DAG.getRegister(Vreg, SPTy));
9777    SDValue Ops1[2] = { Value, Chain };
9778    return DAG.getMergeValues(Ops1, 2, dl);
9779  } else {
9780    SDValue Flag;
9781    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
9782
9783    Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
9784    Flag = Chain.getValue(1);
9785    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9786
9787    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
9788    Flag = Chain.getValue(1);
9789
9790    Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
9791                               SPTy).getValue(1);
9792
9793    SDValue Ops1[2] = { Chain.getValue(0), Chain };
9794    return DAG.getMergeValues(Ops1, 2, dl);
9795  }
9796}
9797
9798SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
9799  MachineFunction &MF = DAG.getMachineFunction();
9800  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
9801
9802  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9803  DebugLoc DL = Op.getDebugLoc();
9804
9805  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
9806    // vastart just stores the address of the VarArgsFrameIndex slot into the
9807    // memory location argument.
9808    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
9809                                   getPointerTy());
9810    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9811                        MachinePointerInfo(SV), false, false, 0);
9812  }
9813
9814  // __va_list_tag:
9815  //   gp_offset         (0 - 6 * 8)
9816  //   fp_offset         (48 - 48 + 8 * 16)
9817  //   overflow_arg_area (point to parameters coming in memory).
9818  //   reg_save_area
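  // Assuming the standard x86-64 SysV layout, the byte offsets stored below
  // are: gp_offset at 0, fp_offset at 4, overflow_arg_area at 8 and
  // reg_save_area at 16 (24 bytes in total).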
9819  SmallVector<SDValue, 8> MemOps;
9820  SDValue FIN = Op.getOperand(1);
9821  // Store gp_offset
9822  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
9823                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
9824                                               MVT::i32),
9825                               FIN, MachinePointerInfo(SV), false, false, 0);
9826  MemOps.push_back(Store);
9827
9828  // Store fp_offset
9829  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9830                    FIN, DAG.getIntPtrConstant(4));
9831  Store = DAG.getStore(Op.getOperand(0), DL,
9832                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
9833                                       MVT::i32),
9834                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
9835  MemOps.push_back(Store);
9836
9837  // Store ptr to overflow_arg_area
9838  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9839                    FIN, DAG.getIntPtrConstant(4));
9840  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
9841                                    getPointerTy());
9842  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
9843                       MachinePointerInfo(SV, 8),
9844                       false, false, 0);
9845  MemOps.push_back(Store);
9846
9847  // Store ptr to reg_save_area.
9848  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9849                    FIN, DAG.getIntPtrConstant(8));
9850  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
9851                                    getPointerTy());
9852  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
9853                       MachinePointerInfo(SV, 16), false, false, 0);
9854  MemOps.push_back(Store);
9855  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
9856                     &MemOps[0], MemOps.size());
9857}
9858
9859SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
9860  assert(Subtarget->is64Bit() &&
9861         "LowerVAARG only handles 64-bit va_arg!");
9862  assert((Subtarget->isTargetLinux() ||
9863          Subtarget->isTargetDarwin()) &&
9864          "Unhandled target in LowerVAARG");
9865  assert(Op.getNode()->getNumOperands() == 4);
9866  SDValue Chain = Op.getOperand(0);
9867  SDValue SrcPtr = Op.getOperand(1);
9868  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9869  unsigned Align = Op.getConstantOperandVal(3);
9870  DebugLoc dl = Op.getDebugLoc();
9871
9872  EVT ArgVT = Op.getNode()->getValueType(0);
9873  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9874  uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
9875  uint8_t ArgMode;
9876
9877  // Decide which area this value should be read from.
9878  // TODO: Implement the AMD64 ABI in its entirety. This simple
9879  // selection mechanism works only for the basic types.
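  // A minimal sketch of the choice below: f80 is not handled yet, floating
  // point values of at most 16 bytes are read via fp_offset (the XMM save
  // area), and integers of at most 32 bytes are read via gp_offset (the GPR
  // save area).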
9880  if (ArgVT == MVT::f80) {
9881    llvm_unreachable("va_arg for f80 not yet implemented");
9882  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
9883    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
9884  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
9885    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
9886  } else {
9887    llvm_unreachable("Unhandled argument type in LowerVAARG");
9888  }
9889
9890  if (ArgMode == 2) {
9891    // Sanity Check: Make sure using fp_offset makes sense.
9892    assert(!getTargetMachine().Options.UseSoftFloat &&
9893           !(DAG.getMachineFunction()
9894                .getFunction()->getFnAttributes()
9895                .hasAttribute(Attributes::NoImplicitFloat)) &&
9896           Subtarget->hasSSE1());
9897  }
9898
9899  // Insert VAARG_64 node into the DAG
9900  // VAARG_64 returns two values: Variable Argument Address, Chain
9901  SmallVector<SDValue, 11> InstOps;
9902  InstOps.push_back(Chain);
9903  InstOps.push_back(SrcPtr);
9904  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
9905  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
9906  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
9907  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
9908  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
9909                                          VTs, &InstOps[0], InstOps.size(),
9910                                          MVT::i64,
9911                                          MachinePointerInfo(SV),
9912                                          /*Align=*/0,
9913                                          /*Volatile=*/false,
9914                                          /*ReadMem=*/true,
9915                                          /*WriteMem=*/true);
9916  Chain = VAARG.getValue(1);
9917
9918  // Load the next argument and return it
9919  return DAG.getLoad(ArgVT, dl,
9920                     Chain,
9921                     VAARG,
9922                     MachinePointerInfo(),
9923                     false, false, false, 0);
9924}
9925
9926static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
9927                           SelectionDAG &DAG) {
9928  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
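  // That is (assuming the usual SysV layout) 4 + 4 + 8 + 8 = 24 bytes, which
  // is why the memcpy below copies a fixed 24 bytes with 8-byte alignment.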
9929  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
9930  SDValue Chain = Op.getOperand(0);
9931  SDValue DstPtr = Op.getOperand(1);
9932  SDValue SrcPtr = Op.getOperand(2);
9933  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
9934  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
9935  DebugLoc DL = Op.getDebugLoc();
9936
9937  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
9938                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
9939                       false,
9940                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
9941}
9942
9943// getTargetVShiftNode - Handle vector element shifts where the shift amount
9944// may or may not be a constant. Takes immediate version of shift as input.
9945static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
9946                                   SDValue SrcOp, SDValue ShAmt,
9947                                   SelectionDAG &DAG) {
9948  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
9949
9950  if (isa<ConstantSDNode>(ShAmt)) {
9951    // Constant may be a TargetConstant. Use a regular constant.
9952    uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
9953    switch (Opc) {
9954      default: llvm_unreachable("Unknown target vector shift node");
9955      case X86ISD::VSHLI:
9956      case X86ISD::VSRLI:
9957      case X86ISD::VSRAI:
9958        return DAG.getNode(Opc, dl, VT, SrcOp,
9959                           DAG.getConstant(ShiftAmt, MVT::i32));
9960    }
9961  }
9962
9963  // Change opcode to non-immediate version
9964  switch (Opc) {
9965    default: llvm_unreachable("Unknown target vector shift node");
9966    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
9967    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
9968    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
9969  }
9970
9971  // Need to build a vector containing the shift amount. The amount is
9972  // 32 bits, but SSE shift instructions read 64 bits, so fill the rest with 0.
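  // Illustrative layout of the v4i32 BUILD_VECTOR constructed below:
  //   { ShAmt, 0, undef, undef }
  // so the low 64 bits read by VSHL/VSRL/VSRA are the zero-extended 32-bit
  // shift amount.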
9973  SDValue ShOps[4];
9974  ShOps[0] = ShAmt;
9975  ShOps[1] = DAG.getConstant(0, MVT::i32);
9976  ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
9977  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
9978
9979  // The return type has to be a 128-bit type with the same element
9980  // type as the input type.
9981  MVT EltVT = VT.getVectorElementType().getSimpleVT();
9982  EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
9983
9984  ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
9985  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
9986}
9987
9988static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
9989  DebugLoc dl = Op.getDebugLoc();
9990  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9991  switch (IntNo) {
9992  default: return SDValue();    // Don't custom lower most intrinsics.
9993  // Comparison intrinsics.
9994  case Intrinsic::x86_sse_comieq_ss:
9995  case Intrinsic::x86_sse_comilt_ss:
9996  case Intrinsic::x86_sse_comile_ss:
9997  case Intrinsic::x86_sse_comigt_ss:
9998  case Intrinsic::x86_sse_comige_ss:
9999  case Intrinsic::x86_sse_comineq_ss:
10000  case Intrinsic::x86_sse_ucomieq_ss:
10001  case Intrinsic::x86_sse_ucomilt_ss:
10002  case Intrinsic::x86_sse_ucomile_ss:
10003  case Intrinsic::x86_sse_ucomigt_ss:
10004  case Intrinsic::x86_sse_ucomige_ss:
10005  case Intrinsic::x86_sse_ucomineq_ss:
10006  case Intrinsic::x86_sse2_comieq_sd:
10007  case Intrinsic::x86_sse2_comilt_sd:
10008  case Intrinsic::x86_sse2_comile_sd:
10009  case Intrinsic::x86_sse2_comigt_sd:
10010  case Intrinsic::x86_sse2_comige_sd:
10011  case Intrinsic::x86_sse2_comineq_sd:
10012  case Intrinsic::x86_sse2_ucomieq_sd:
10013  case Intrinsic::x86_sse2_ucomilt_sd:
10014  case Intrinsic::x86_sse2_ucomile_sd:
10015  case Intrinsic::x86_sse2_ucomigt_sd:
10016  case Intrinsic::x86_sse2_ucomige_sd:
10017  case Intrinsic::x86_sse2_ucomineq_sd: {
10018    unsigned Opc;
10019    ISD::CondCode CC;
10020    switch (IntNo) {
10021    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10022    case Intrinsic::x86_sse_comieq_ss:
10023    case Intrinsic::x86_sse2_comieq_sd:
10024      Opc = X86ISD::COMI;
10025      CC = ISD::SETEQ;
10026      break;
10027    case Intrinsic::x86_sse_comilt_ss:
10028    case Intrinsic::x86_sse2_comilt_sd:
10029      Opc = X86ISD::COMI;
10030      CC = ISD::SETLT;
10031      break;
10032    case Intrinsic::x86_sse_comile_ss:
10033    case Intrinsic::x86_sse2_comile_sd:
10034      Opc = X86ISD::COMI;
10035      CC = ISD::SETLE;
10036      break;
10037    case Intrinsic::x86_sse_comigt_ss:
10038    case Intrinsic::x86_sse2_comigt_sd:
10039      Opc = X86ISD::COMI;
10040      CC = ISD::SETGT;
10041      break;
10042    case Intrinsic::x86_sse_comige_ss:
10043    case Intrinsic::x86_sse2_comige_sd:
10044      Opc = X86ISD::COMI;
10045      CC = ISD::SETGE;
10046      break;
10047    case Intrinsic::x86_sse_comineq_ss:
10048    case Intrinsic::x86_sse2_comineq_sd:
10049      Opc = X86ISD::COMI;
10050      CC = ISD::SETNE;
10051      break;
10052    case Intrinsic::x86_sse_ucomieq_ss:
10053    case Intrinsic::x86_sse2_ucomieq_sd:
10054      Opc = X86ISD::UCOMI;
10055      CC = ISD::SETEQ;
10056      break;
10057    case Intrinsic::x86_sse_ucomilt_ss:
10058    case Intrinsic::x86_sse2_ucomilt_sd:
10059      Opc = X86ISD::UCOMI;
10060      CC = ISD::SETLT;
10061      break;
10062    case Intrinsic::x86_sse_ucomile_ss:
10063    case Intrinsic::x86_sse2_ucomile_sd:
10064      Opc = X86ISD::UCOMI;
10065      CC = ISD::SETLE;
10066      break;
10067    case Intrinsic::x86_sse_ucomigt_ss:
10068    case Intrinsic::x86_sse2_ucomigt_sd:
10069      Opc = X86ISD::UCOMI;
10070      CC = ISD::SETGT;
10071      break;
10072    case Intrinsic::x86_sse_ucomige_ss:
10073    case Intrinsic::x86_sse2_ucomige_sd:
10074      Opc = X86ISD::UCOMI;
10075      CC = ISD::SETGE;
10076      break;
10077    case Intrinsic::x86_sse_ucomineq_ss:
10078    case Intrinsic::x86_sse2_ucomineq_sd:
10079      Opc = X86ISD::UCOMI;
10080      CC = ISD::SETNE;
10081      break;
10082    }
10083
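    // In rough terms: emit an X86ISD::COMI / X86ISD::UCOMI compare of the two
    // operands, then an X86ISD::SETCC of the translated condition code,
    // zero-extended to the i32 result the intrinsic is defined to return.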
10084    SDValue LHS = Op.getOperand(1);
10085    SDValue RHS = Op.getOperand(2);
10086    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
10087    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
10088    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
10089    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
10090                                DAG.getConstant(X86CC, MVT::i8), Cond);
10091    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
10092  }
10093
10094  // Arithmetic intrinsics.
10095  case Intrinsic::x86_sse2_pmulu_dq:
10096  case Intrinsic::x86_avx2_pmulu_dq:
10097    return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
10098                       Op.getOperand(1), Op.getOperand(2));
10099
10100  // SSE3/AVX horizontal add/sub intrinsics
10101  case Intrinsic::x86_sse3_hadd_ps:
10102  case Intrinsic::x86_sse3_hadd_pd:
10103  case Intrinsic::x86_avx_hadd_ps_256:
10104  case Intrinsic::x86_avx_hadd_pd_256:
10105  case Intrinsic::x86_sse3_hsub_ps:
10106  case Intrinsic::x86_sse3_hsub_pd:
10107  case Intrinsic::x86_avx_hsub_ps_256:
10108  case Intrinsic::x86_avx_hsub_pd_256:
10109  case Intrinsic::x86_ssse3_phadd_w_128:
10110  case Intrinsic::x86_ssse3_phadd_d_128:
10111  case Intrinsic::x86_avx2_phadd_w:
10112  case Intrinsic::x86_avx2_phadd_d:
10113  case Intrinsic::x86_ssse3_phsub_w_128:
10114  case Intrinsic::x86_ssse3_phsub_d_128:
10115  case Intrinsic::x86_avx2_phsub_w:
10116  case Intrinsic::x86_avx2_phsub_d: {
10117    unsigned Opcode;
10118    switch (IntNo) {
10119    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10120    case Intrinsic::x86_sse3_hadd_ps:
10121    case Intrinsic::x86_sse3_hadd_pd:
10122    case Intrinsic::x86_avx_hadd_ps_256:
10123    case Intrinsic::x86_avx_hadd_pd_256:
10124      Opcode = X86ISD::FHADD;
10125      break;
10126    case Intrinsic::x86_sse3_hsub_ps:
10127    case Intrinsic::x86_sse3_hsub_pd:
10128    case Intrinsic::x86_avx_hsub_ps_256:
10129    case Intrinsic::x86_avx_hsub_pd_256:
10130      Opcode = X86ISD::FHSUB;
10131      break;
10132    case Intrinsic::x86_ssse3_phadd_w_128:
10133    case Intrinsic::x86_ssse3_phadd_d_128:
10134    case Intrinsic::x86_avx2_phadd_w:
10135    case Intrinsic::x86_avx2_phadd_d:
10136      Opcode = X86ISD::HADD;
10137      break;
10138    case Intrinsic::x86_ssse3_phsub_w_128:
10139    case Intrinsic::x86_ssse3_phsub_d_128:
10140    case Intrinsic::x86_avx2_phsub_w:
10141    case Intrinsic::x86_avx2_phsub_d:
10142      Opcode = X86ISD::HSUB;
10143      break;
10144    }
10145    return DAG.getNode(Opcode, dl, Op.getValueType(),
10146                       Op.getOperand(1), Op.getOperand(2));
10147  }
10148
10149  // AVX2 variable shift intrinsics
10150  case Intrinsic::x86_avx2_psllv_d:
10151  case Intrinsic::x86_avx2_psllv_q:
10152  case Intrinsic::x86_avx2_psllv_d_256:
10153  case Intrinsic::x86_avx2_psllv_q_256:
10154  case Intrinsic::x86_avx2_psrlv_d:
10155  case Intrinsic::x86_avx2_psrlv_q:
10156  case Intrinsic::x86_avx2_psrlv_d_256:
10157  case Intrinsic::x86_avx2_psrlv_q_256:
10158  case Intrinsic::x86_avx2_psrav_d:
10159  case Intrinsic::x86_avx2_psrav_d_256: {
10160    unsigned Opcode;
10161    switch (IntNo) {
10162    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10163    case Intrinsic::x86_avx2_psllv_d:
10164    case Intrinsic::x86_avx2_psllv_q:
10165    case Intrinsic::x86_avx2_psllv_d_256:
10166    case Intrinsic::x86_avx2_psllv_q_256:
10167      Opcode = ISD::SHL;
10168      break;
10169    case Intrinsic::x86_avx2_psrlv_d:
10170    case Intrinsic::x86_avx2_psrlv_q:
10171    case Intrinsic::x86_avx2_psrlv_d_256:
10172    case Intrinsic::x86_avx2_psrlv_q_256:
10173      Opcode = ISD::SRL;
10174      break;
10175    case Intrinsic::x86_avx2_psrav_d:
10176    case Intrinsic::x86_avx2_psrav_d_256:
10177      Opcode = ISD::SRA;
10178      break;
10179    }
10180    return DAG.getNode(Opcode, dl, Op.getValueType(),
10181                       Op.getOperand(1), Op.getOperand(2));
10182  }
10183
10184  case Intrinsic::x86_ssse3_pshuf_b_128:
10185  case Intrinsic::x86_avx2_pshuf_b:
10186    return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
10187                       Op.getOperand(1), Op.getOperand(2));
10188
10189  case Intrinsic::x86_ssse3_psign_b_128:
10190  case Intrinsic::x86_ssse3_psign_w_128:
10191  case Intrinsic::x86_ssse3_psign_d_128:
10192  case Intrinsic::x86_avx2_psign_b:
10193  case Intrinsic::x86_avx2_psign_w:
10194  case Intrinsic::x86_avx2_psign_d:
10195    return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
10196                       Op.getOperand(1), Op.getOperand(2));
10197
10198  case Intrinsic::x86_sse41_insertps:
10199    return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
10200                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
10201
10202  case Intrinsic::x86_avx_vperm2f128_ps_256:
10203  case Intrinsic::x86_avx_vperm2f128_pd_256:
10204  case Intrinsic::x86_avx_vperm2f128_si_256:
10205  case Intrinsic::x86_avx2_vperm2i128:
10206    return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
10207                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
10208
10209  case Intrinsic::x86_avx2_permd:
10210  case Intrinsic::x86_avx2_permps:
10211    // Operands intentionally swapped. Mask is last operand to intrinsic,
10212    // but second operand for the node / instruction.
10213    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
10214                       Op.getOperand(2), Op.getOperand(1));
10215
10216  // ptest and testp intrinsics. The intrinsics these come from are designed to
10217  // return an integer value, not just an instruction, so lower them to the
10218  // ptest or testp pattern and a setcc for the result.
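  // For example, ptestz maps to PTEST/TESTP + COND_E (ZF set), ptestc to
  // COND_B (CF set), and ptestnzc to COND_A (both ZF and CF clear), as
  // selected in the switch below.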
10219  case Intrinsic::x86_sse41_ptestz:
10220  case Intrinsic::x86_sse41_ptestc:
10221  case Intrinsic::x86_sse41_ptestnzc:
10222  case Intrinsic::x86_avx_ptestz_256:
10223  case Intrinsic::x86_avx_ptestc_256:
10224  case Intrinsic::x86_avx_ptestnzc_256:
10225  case Intrinsic::x86_avx_vtestz_ps:
10226  case Intrinsic::x86_avx_vtestc_ps:
10227  case Intrinsic::x86_avx_vtestnzc_ps:
10228  case Intrinsic::x86_avx_vtestz_pd:
10229  case Intrinsic::x86_avx_vtestc_pd:
10230  case Intrinsic::x86_avx_vtestnzc_pd:
10231  case Intrinsic::x86_avx_vtestz_ps_256:
10232  case Intrinsic::x86_avx_vtestc_ps_256:
10233  case Intrinsic::x86_avx_vtestnzc_ps_256:
10234  case Intrinsic::x86_avx_vtestz_pd_256:
10235  case Intrinsic::x86_avx_vtestc_pd_256:
10236  case Intrinsic::x86_avx_vtestnzc_pd_256: {
10237    bool IsTestPacked = false;
10238    unsigned X86CC;
10239    switch (IntNo) {
10240    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
10241    case Intrinsic::x86_avx_vtestz_ps:
10242    case Intrinsic::x86_avx_vtestz_pd:
10243    case Intrinsic::x86_avx_vtestz_ps_256:
10244    case Intrinsic::x86_avx_vtestz_pd_256:
10245      IsTestPacked = true; // Fallthrough
10246    case Intrinsic::x86_sse41_ptestz:
10247    case Intrinsic::x86_avx_ptestz_256:
10248      // ZF = 1
10249      X86CC = X86::COND_E;
10250      break;
10251    case Intrinsic::x86_avx_vtestc_ps:
10252    case Intrinsic::x86_avx_vtestc_pd:
10253    case Intrinsic::x86_avx_vtestc_ps_256:
10254    case Intrinsic::x86_avx_vtestc_pd_256:
10255      IsTestPacked = true; // Fallthrough
10256    case Intrinsic::x86_sse41_ptestc:
10257    case Intrinsic::x86_avx_ptestc_256:
10258      // CF = 1
10259      X86CC = X86::COND_B;
10260      break;
10261    case Intrinsic::x86_avx_vtestnzc_ps:
10262    case Intrinsic::x86_avx_vtestnzc_pd:
10263    case Intrinsic::x86_avx_vtestnzc_ps_256:
10264    case Intrinsic::x86_avx_vtestnzc_pd_256:
10265      IsTestPacked = true; // Fallthrough
10266    case Intrinsic::x86_sse41_ptestnzc:
10267    case Intrinsic::x86_avx_ptestnzc_256:
10268      // ZF and CF = 0
10269      X86CC = X86::COND_A;
10270      break;
10271    }
10272
10273    SDValue LHS = Op.getOperand(1);
10274    SDValue RHS = Op.getOperand(2);
10275    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
10276    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
10277    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
10278    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
10279    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
10280  }
10281
10282  // SSE/AVX shift intrinsics
10283  case Intrinsic::x86_sse2_psll_w:
10284  case Intrinsic::x86_sse2_psll_d:
10285  case Intrinsic::x86_sse2_psll_q:
10286  case Intrinsic::x86_avx2_psll_w:
10287  case Intrinsic::x86_avx2_psll_d:
10288  case Intrinsic::x86_avx2_psll_q:
10289  case Intrinsic::x86_sse2_psrl_w:
10290  case Intrinsic::x86_sse2_psrl_d:
10291  case Intrinsic::x86_sse2_psrl_q:
10292  case Intrinsic::x86_avx2_psrl_w:
10293  case Intrinsic::x86_avx2_psrl_d:
10294  case Intrinsic::x86_avx2_psrl_q:
10295  case Intrinsic::x86_sse2_psra_w:
10296  case Intrinsic::x86_sse2_psra_d:
10297  case Intrinsic::x86_avx2_psra_w:
10298  case Intrinsic::x86_avx2_psra_d: {
10299    unsigned Opcode;
10300    switch (IntNo) {
10301    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10302    case Intrinsic::x86_sse2_psll_w:
10303    case Intrinsic::x86_sse2_psll_d:
10304    case Intrinsic::x86_sse2_psll_q:
10305    case Intrinsic::x86_avx2_psll_w:
10306    case Intrinsic::x86_avx2_psll_d:
10307    case Intrinsic::x86_avx2_psll_q:
10308      Opcode = X86ISD::VSHL;
10309      break;
10310    case Intrinsic::x86_sse2_psrl_w:
10311    case Intrinsic::x86_sse2_psrl_d:
10312    case Intrinsic::x86_sse2_psrl_q:
10313    case Intrinsic::x86_avx2_psrl_w:
10314    case Intrinsic::x86_avx2_psrl_d:
10315    case Intrinsic::x86_avx2_psrl_q:
10316      Opcode = X86ISD::VSRL;
10317      break;
10318    case Intrinsic::x86_sse2_psra_w:
10319    case Intrinsic::x86_sse2_psra_d:
10320    case Intrinsic::x86_avx2_psra_w:
10321    case Intrinsic::x86_avx2_psra_d:
10322      Opcode = X86ISD::VSRA;
10323      break;
10324    }
10325    return DAG.getNode(Opcode, dl, Op.getValueType(),
10326                       Op.getOperand(1), Op.getOperand(2));
10327  }
10328
10329  // SSE/AVX immediate shift intrinsics
10330  case Intrinsic::x86_sse2_pslli_w:
10331  case Intrinsic::x86_sse2_pslli_d:
10332  case Intrinsic::x86_sse2_pslli_q:
10333  case Intrinsic::x86_avx2_pslli_w:
10334  case Intrinsic::x86_avx2_pslli_d:
10335  case Intrinsic::x86_avx2_pslli_q:
10336  case Intrinsic::x86_sse2_psrli_w:
10337  case Intrinsic::x86_sse2_psrli_d:
10338  case Intrinsic::x86_sse2_psrli_q:
10339  case Intrinsic::x86_avx2_psrli_w:
10340  case Intrinsic::x86_avx2_psrli_d:
10341  case Intrinsic::x86_avx2_psrli_q:
10342  case Intrinsic::x86_sse2_psrai_w:
10343  case Intrinsic::x86_sse2_psrai_d:
10344  case Intrinsic::x86_avx2_psrai_w:
10345  case Intrinsic::x86_avx2_psrai_d: {
10346    unsigned Opcode;
10347    switch (IntNo) {
10348    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10349    case Intrinsic::x86_sse2_pslli_w:
10350    case Intrinsic::x86_sse2_pslli_d:
10351    case Intrinsic::x86_sse2_pslli_q:
10352    case Intrinsic::x86_avx2_pslli_w:
10353    case Intrinsic::x86_avx2_pslli_d:
10354    case Intrinsic::x86_avx2_pslli_q:
10355      Opcode = X86ISD::VSHLI;
10356      break;
10357    case Intrinsic::x86_sse2_psrli_w:
10358    case Intrinsic::x86_sse2_psrli_d:
10359    case Intrinsic::x86_sse2_psrli_q:
10360    case Intrinsic::x86_avx2_psrli_w:
10361    case Intrinsic::x86_avx2_psrli_d:
10362    case Intrinsic::x86_avx2_psrli_q:
10363      Opcode = X86ISD::VSRLI;
10364      break;
10365    case Intrinsic::x86_sse2_psrai_w:
10366    case Intrinsic::x86_sse2_psrai_d:
10367    case Intrinsic::x86_avx2_psrai_w:
10368    case Intrinsic::x86_avx2_psrai_d:
10369      Opcode = X86ISD::VSRAI;
10370      break;
10371    }
10372    return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
10373                               Op.getOperand(1), Op.getOperand(2), DAG);
10374  }
10375
10376  case Intrinsic::x86_sse42_pcmpistria128:
10377  case Intrinsic::x86_sse42_pcmpestria128:
10378  case Intrinsic::x86_sse42_pcmpistric128:
10379  case Intrinsic::x86_sse42_pcmpestric128:
10380  case Intrinsic::x86_sse42_pcmpistrio128:
10381  case Intrinsic::x86_sse42_pcmpestrio128:
10382  case Intrinsic::x86_sse42_pcmpistris128:
10383  case Intrinsic::x86_sse42_pcmpestris128:
10384  case Intrinsic::x86_sse42_pcmpistriz128:
10385  case Intrinsic::x86_sse42_pcmpestriz128: {
10386    unsigned Opcode;
10387    unsigned X86CC;
10388    switch (IntNo) {
10389    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10390    case Intrinsic::x86_sse42_pcmpistria128:
10391      Opcode = X86ISD::PCMPISTRI;
10392      X86CC = X86::COND_A;
10393      break;
10394    case Intrinsic::x86_sse42_pcmpestria128:
10395      Opcode = X86ISD::PCMPESTRI;
10396      X86CC = X86::COND_A;
10397      break;
10398    case Intrinsic::x86_sse42_pcmpistric128:
10399      Opcode = X86ISD::PCMPISTRI;
10400      X86CC = X86::COND_B;
10401      break;
10402    case Intrinsic::x86_sse42_pcmpestric128:
10403      Opcode = X86ISD::PCMPESTRI;
10404      X86CC = X86::COND_B;
10405      break;
10406    case Intrinsic::x86_sse42_pcmpistrio128:
10407      Opcode = X86ISD::PCMPISTRI;
10408      X86CC = X86::COND_O;
10409      break;
10410    case Intrinsic::x86_sse42_pcmpestrio128:
10411      Opcode = X86ISD::PCMPESTRI;
10412      X86CC = X86::COND_O;
10413      break;
10414    case Intrinsic::x86_sse42_pcmpistris128:
10415      Opcode = X86ISD::PCMPISTRI;
10416      X86CC = X86::COND_S;
10417      break;
10418    case Intrinsic::x86_sse42_pcmpestris128:
10419      Opcode = X86ISD::PCMPESTRI;
10420      X86CC = X86::COND_S;
10421      break;
10422    case Intrinsic::x86_sse42_pcmpistriz128:
10423      Opcode = X86ISD::PCMPISTRI;
10424      X86CC = X86::COND_E;
10425      break;
10426    case Intrinsic::x86_sse42_pcmpestriz128:
10427      Opcode = X86ISD::PCMPESTRI;
10428      X86CC = X86::COND_E;
10429      break;
10430    }
10431    SmallVector<SDValue, 5> NewOps;
10432    NewOps.append(Op->op_begin()+1, Op->op_end());
10433    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
10434    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
10435    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
10436                                DAG.getConstant(X86CC, MVT::i8),
10437                                SDValue(PCMP.getNode(), 1));
10438    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
10439  }
10440
10441  case Intrinsic::x86_sse42_pcmpistri128:
10442  case Intrinsic::x86_sse42_pcmpestri128: {
10443    unsigned Opcode;
10444    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
10445      Opcode = X86ISD::PCMPISTRI;
10446    else
10447      Opcode = X86ISD::PCMPESTRI;
10448
10449    SmallVector<SDValue, 5> NewOps;
10450    NewOps.append(Op->op_begin()+1, Op->op_end());
10451    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
10452    return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
10453  }
10454  case Intrinsic::x86_fma_vfmadd_ps:
10455  case Intrinsic::x86_fma_vfmadd_pd:
10456  case Intrinsic::x86_fma_vfmsub_ps:
10457  case Intrinsic::x86_fma_vfmsub_pd:
10458  case Intrinsic::x86_fma_vfnmadd_ps:
10459  case Intrinsic::x86_fma_vfnmadd_pd:
10460  case Intrinsic::x86_fma_vfnmsub_ps:
10461  case Intrinsic::x86_fma_vfnmsub_pd:
10462  case Intrinsic::x86_fma_vfmaddsub_ps:
10463  case Intrinsic::x86_fma_vfmaddsub_pd:
10464  case Intrinsic::x86_fma_vfmsubadd_ps:
10465  case Intrinsic::x86_fma_vfmsubadd_pd:
10466  case Intrinsic::x86_fma_vfmadd_ps_256:
10467  case Intrinsic::x86_fma_vfmadd_pd_256:
10468  case Intrinsic::x86_fma_vfmsub_ps_256:
10469  case Intrinsic::x86_fma_vfmsub_pd_256:
10470  case Intrinsic::x86_fma_vfnmadd_ps_256:
10471  case Intrinsic::x86_fma_vfnmadd_pd_256:
10472  case Intrinsic::x86_fma_vfnmsub_ps_256:
10473  case Intrinsic::x86_fma_vfnmsub_pd_256:
10474  case Intrinsic::x86_fma_vfmaddsub_ps_256:
10475  case Intrinsic::x86_fma_vfmaddsub_pd_256:
10476  case Intrinsic::x86_fma_vfmsubadd_ps_256:
10477  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
10478    unsigned Opc;
10479    switch (IntNo) {
10480    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10481    case Intrinsic::x86_fma_vfmadd_ps:
10482    case Intrinsic::x86_fma_vfmadd_pd:
10483    case Intrinsic::x86_fma_vfmadd_ps_256:
10484    case Intrinsic::x86_fma_vfmadd_pd_256:
10485      Opc = X86ISD::FMADD;
10486      break;
10487    case Intrinsic::x86_fma_vfmsub_ps:
10488    case Intrinsic::x86_fma_vfmsub_pd:
10489    case Intrinsic::x86_fma_vfmsub_ps_256:
10490    case Intrinsic::x86_fma_vfmsub_pd_256:
10491      Opc = X86ISD::FMSUB;
10492      break;
10493    case Intrinsic::x86_fma_vfnmadd_ps:
10494    case Intrinsic::x86_fma_vfnmadd_pd:
10495    case Intrinsic::x86_fma_vfnmadd_ps_256:
10496    case Intrinsic::x86_fma_vfnmadd_pd_256:
10497      Opc = X86ISD::FNMADD;
10498      break;
10499    case Intrinsic::x86_fma_vfnmsub_ps:
10500    case Intrinsic::x86_fma_vfnmsub_pd:
10501    case Intrinsic::x86_fma_vfnmsub_ps_256:
10502    case Intrinsic::x86_fma_vfnmsub_pd_256:
10503      Opc = X86ISD::FNMSUB;
10504      break;
10505    case Intrinsic::x86_fma_vfmaddsub_ps:
10506    case Intrinsic::x86_fma_vfmaddsub_pd:
10507    case Intrinsic::x86_fma_vfmaddsub_ps_256:
10508    case Intrinsic::x86_fma_vfmaddsub_pd_256:
10509      Opc = X86ISD::FMADDSUB;
10510      break;
10511    case Intrinsic::x86_fma_vfmsubadd_ps:
10512    case Intrinsic::x86_fma_vfmsubadd_pd:
10513    case Intrinsic::x86_fma_vfmsubadd_ps_256:
10514    case Intrinsic::x86_fma_vfmsubadd_pd_256:
10515      Opc = X86ISD::FMSUBADD;
10516      break;
10517    }
10518
10519    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
10520                       Op.getOperand(2), Op.getOperand(3));
10521  }
10522  }
10523}
10524
10525static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
10526  DebugLoc dl = Op.getDebugLoc();
10527  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
10528  switch (IntNo) {
10529  default: return SDValue();    // Don't custom lower most intrinsics.
10530
10531  // RDRAND intrinsics.
10532  case Intrinsic::x86_rdrand_16:
10533  case Intrinsic::x86_rdrand_32:
10534  case Intrinsic::x86_rdrand_64: {
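    // rdrand sets CF to 1 when it delivered a valid random value and writes 0
    // to its destination otherwise; the CMOV built below keys on that flag.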
10535    // Emit the node with the right value type.
10536    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
10537    SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0));
10538
10539    // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise
10540    // return the value from RDRAND, which is always 0, cast to i32.
10541    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
10542                      DAG.getConstant(1, Op->getValueType(1)),
10543                      DAG.getConstant(X86::COND_B, MVT::i32),
10544                      SDValue(Result.getNode(), 1) };
10545    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
10546                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
10547                                  Ops, 4);
10548
10549    // Return { result, isValid, chain }.
10550    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
10551                       SDValue(Result.getNode(), 2));
10552  }
10553  }
10554}
10555
10556SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
10557                                           SelectionDAG &DAG) const {
10558  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
10559  MFI->setReturnAddressIsTaken(true);
10560
10561  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10562  DebugLoc dl = Op.getDebugLoc();
10563  EVT PtrVT = getPointerTy();
10564
10565  if (Depth > 0) {
10566    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10567    SDValue Offset =
10568      DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
10569    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
10570                       DAG.getNode(ISD::ADD, dl, PtrVT,
10571                                   FrameAddr, Offset),
10572                       MachinePointerInfo(), false, false, false, 0);
10573  }
10574
10575  // Just load the return address.
10576  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
10577  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
10578                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
10579}
10580
10581SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
10582  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
10583  MFI->setFrameAddressIsTaken(true);
10584
10585  EVT VT = Op.getValueType();
10586  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
10587  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10588  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
10589  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
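  // Each frame's saved RBP/EBP points at the caller's saved frame pointer, so
  // walking up Depth frames is simply Depth loads through that chain.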
10590  while (Depth--)
10591    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
10592                            MachinePointerInfo(),
10593                            false, false, false, 0);
10594  return FrameAddr;
10595}
10596
10597SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
10598                                                     SelectionDAG &DAG) const {
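  // The 2 * SlotSize offset accounts for the saved frame pointer and the
  // return address that sit between the frame address and the incoming
  // arguments.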
10599  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
10600}
10601
10602SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
10603  SDValue Chain     = Op.getOperand(0);
10604  SDValue Offset    = Op.getOperand(1);
10605  SDValue Handler   = Op.getOperand(2);
10606  DebugLoc dl       = Op.getDebugLoc();
10607
10608  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
10609                                     Subtarget->is64Bit() ? X86::RBP : X86::EBP,
10610                                     getPointerTy());
10611  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
10612
10613  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
10614                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
10615  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
10616  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
10617                       false, false, 0);
10618  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
10619
10620  return DAG.getNode(X86ISD::EH_RETURN, dl,
10621                     MVT::Other,
10622                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
10623}
10624
10625SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
10626                                               SelectionDAG &DAG) const {
10627  DebugLoc DL = Op.getDebugLoc();
10628  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
10629                     DAG.getVTList(MVT::i32, MVT::Other),
10630                     Op.getOperand(0), Op.getOperand(1));
10631}
10632
10633SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
10634                                                SelectionDAG &DAG) const {
10635  DebugLoc DL = Op.getDebugLoc();
10636  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
10637                     Op.getOperand(0), Op.getOperand(1));
10638}
10639
10640static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
10641  return Op.getOperand(0);
10642}
10643
10644SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
10645                                                SelectionDAG &DAG) const {
10646  SDValue Root = Op.getOperand(0);
10647  SDValue Trmp = Op.getOperand(1); // trampoline
10648  SDValue FPtr = Op.getOperand(2); // nested function
10649  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
10650  DebugLoc dl  = Op.getDebugLoc();
10651
10652  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10653  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
10654
10655  if (Subtarget->is64Bit()) {
10656    SDValue OutChains[6];
10657
10658    // Large code-model.
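    // The emitted 23-byte trampoline consists of three instructions:
    //   offset  0: REX.WB, B8+r11   movabsq $<FPtr>, %r11  (imm64 at offset 2)
    //   offset 10: REX.WB, B8+r10   movabsq $<Nest>, %r10  (imm64 at offset 12)
    //   offset 20: REX.WB, FF /4    jmpq *%r11             (ModRM at offset 22)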
10659    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
10660    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
10661
10662    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
10663    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
10664
10665    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
10666
10667    // Load the pointer to the nested function into R11.
10668    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
10669    SDValue Addr = Trmp;
10670    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
10671                                Addr, MachinePointerInfo(TrmpAddr),
10672                                false, false, 0);
10673
10674    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10675                       DAG.getConstant(2, MVT::i64));
10676    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
10677                                MachinePointerInfo(TrmpAddr, 2),
10678                                false, false, 2);
10679
10680    // Load the 'nest' parameter value into R10.
10681    // R10 is specified in X86CallingConv.td
10682    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
10683    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10684                       DAG.getConstant(10, MVT::i64));
10685    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
10686                                Addr, MachinePointerInfo(TrmpAddr, 10),
10687                                false, false, 0);
10688
10689    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10690                       DAG.getConstant(12, MVT::i64));
10691    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
10692                                MachinePointerInfo(TrmpAddr, 12),
10693                                false, false, 2);
10694
10695    // Jump to the nested function.
10696    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
10697    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10698                       DAG.getConstant(20, MVT::i64));
10699    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
10700                                Addr, MachinePointerInfo(TrmpAddr, 20),
10701                                false, false, 0);
10702
10703    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
10704    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10705                       DAG.getConstant(22, MVT::i64));
10706    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
10707                                MachinePointerInfo(TrmpAddr, 22),
10708                                false, false, 0);
10709
10710    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
10711  } else {
10712    const Function *Func =
10713      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
10714    CallingConv::ID CC = Func->getCallingConv();
10715    unsigned NestReg;
10716
10717    switch (CC) {
10718    default:
10719      llvm_unreachable("Unsupported calling convention");
10720    case CallingConv::C:
10721    case CallingConv::X86_StdCall: {
10722      // Pass 'nest' parameter in ECX.
10723      // Must be kept in sync with X86CallingConv.td
10724      NestReg = X86::ECX;
10725
10726      // Check that ECX wasn't needed by an 'inreg' parameter.
10727      FunctionType *FTy = Func->getFunctionType();
10728      const AttributeSet &Attrs = Func->getAttributes();
10729
10730      if (!Attrs.isEmpty() && !Func->isVarArg()) {
10731        unsigned InRegCount = 0;
10732        unsigned Idx = 1;
10733
10734        for (FunctionType::param_iterator I = FTy->param_begin(),
10735             E = FTy->param_end(); I != E; ++I, ++Idx)
10736          if (Attrs.getParamAttributes(Idx).hasAttribute(Attributes::InReg))
10737            // FIXME: should only count parameters that are lowered to integers.
10738            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
10739
10740        if (InRegCount > 2) {
10741          report_fatal_error("Nest register in use - reduce number of inreg"
10742                             " parameters!");
10743        }
10744      }
10745      break;
10746    }
10747    case CallingConv::X86_FastCall:
10748    case CallingConv::X86_ThisCall:
10749    case CallingConv::Fast:
10750      // Pass 'nest' parameter in EAX.
10751      // Must be kept in sync with X86CallingConv.td
10752      NestReg = X86::EAX;
10753      break;
10754    }
10755
10756    SDValue OutChains[4];
10757    SDValue Addr, Disp;
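    // The emitted 10-byte trampoline consists of two instructions:
    //   offset 0: B8+reg   movl $<Nest>, %<NestReg>  (imm32 at offset 1)
    //   offset 5: E9       jmp <FPtr>                (rel32 at offset 6, relative
    //                                                 to the end of the trampoline)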
10758
10759    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10760                       DAG.getConstant(10, MVT::i32));
10761    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
10762
10763    // This is storing the opcode for MOV32ri.
10764    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
10765    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
10766    OutChains[0] = DAG.getStore(Root, dl,
10767                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
10768                                Trmp, MachinePointerInfo(TrmpAddr),
10769                                false, false, 0);
10770
10771    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10772                       DAG.getConstant(1, MVT::i32));
10773    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
10774                                MachinePointerInfo(TrmpAddr, 1),
10775                                false, false, 1);
10776
10777    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
10778    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10779                       DAG.getConstant(5, MVT::i32));
10780    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
10781                                MachinePointerInfo(TrmpAddr, 5),
10782                                false, false, 1);
10783
10784    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10785                       DAG.getConstant(6, MVT::i32));
10786    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
10787                                MachinePointerInfo(TrmpAddr, 6),
10788                                false, false, 1);
10789
10790    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
10791  }
10792}
10793
10794SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
10795                                            SelectionDAG &DAG) const {
10796  /*
10797   The rounding mode is in bits 11:10 of the x87 FP control word (the value
10798   saved to memory by fnstcw below), and has the following settings:
10799     00 Round to nearest
10800     01 Round to -inf
10801     10 Round to +inf
10802     11 Round to 0
10803
10804  FLT_ROUNDS, on the other hand, expects the following:
10805    -1 Undefined
10806     0 Round to 0
10807     1 Round to nearest
10808     2 Round to +inf
10809     3 Round to -inf
10810
10811  To perform the conversion, we do:
10812    (((((CW & 0x800) >> 11) | ((CW & 0x400) >> 9)) + 1) & 3)
10813  */
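  // For example, with RC = 01 (round to -inf) bit 11 is 0 and bit 10 is 1, so the
  // expression yields ((0 | 2) + 1) & 3 = 3, FLT_ROUNDS' round-to-minus-inf; with
  // RC = 11 (round to 0) it yields ((1 | 2) + 1) & 3 = 0, FLT_ROUNDS' round-to-zero.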
10814
10815  MachineFunction &MF = DAG.getMachineFunction();
10816  const TargetMachine &TM = MF.getTarget();
10817  const TargetFrameLowering &TFI = *TM.getFrameLowering();
10818  unsigned StackAlignment = TFI.getStackAlignment();
10819  EVT VT = Op.getValueType();
10820  DebugLoc DL = Op.getDebugLoc();
10821
10822  // Save FP Control Word to stack slot
10823  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
10824  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
10825
10826
10827  MachineMemOperand *MMO =
10828   MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
10829                           MachineMemOperand::MOStore, 2, 2);
10830
10831  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
10832  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
10833                                          DAG.getVTList(MVT::Other),
10834                                          Ops, 2, MVT::i16, MMO);
10835
10836  // Load FP Control Word from stack slot
10837  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
10838                            MachinePointerInfo(), false, false, false, 0);
10839
10840  // Transform as necessary
10841  SDValue CWD1 =
10842    DAG.getNode(ISD::SRL, DL, MVT::i16,
10843                DAG.getNode(ISD::AND, DL, MVT::i16,
10844                            CWD, DAG.getConstant(0x800, MVT::i16)),
10845                DAG.getConstant(11, MVT::i8));
10846  SDValue CWD2 =
10847    DAG.getNode(ISD::SRL, DL, MVT::i16,
10848                DAG.getNode(ISD::AND, DL, MVT::i16,
10849                            CWD, DAG.getConstant(0x400, MVT::i16)),
10850                DAG.getConstant(9, MVT::i8));
10851
10852  SDValue RetVal =
10853    DAG.getNode(ISD::AND, DL, MVT::i16,
10854                DAG.getNode(ISD::ADD, DL, MVT::i16,
10855                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
10856                            DAG.getConstant(1, MVT::i16)),
10857                DAG.getConstant(3, MVT::i16));
10858
10859
10860  return DAG.getNode((VT.getSizeInBits() < 16 ?
10861                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
10862}
10863
10864static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
10865  EVT VT = Op.getValueType();
10866  EVT OpVT = VT;
10867  unsigned NumBits = VT.getSizeInBits();
10868  DebugLoc dl = Op.getDebugLoc();
10869
10870  Op = Op.getOperand(0);
10871  if (VT == MVT::i8) {
10872    // Zero extend to i32 since there is no i8 bsr.
10873    OpVT = MVT::i32;
10874    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
10875  }
10876
10877  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
10878  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
10879  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
10880
10881  // If src is zero (i.e. bsr sets ZF), returns NumBits.
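  // BSR returns the index of the highest set bit, so CTLZ is (NumBits-1) - BSR,
  // which equals BSR ^ (NumBits-1) since NumBits is a power of two.  For a zero
  // input the CMOV substitutes 2*NumBits-1 instead, which the XOR below turns
  // into NumBits (e.g. 63 ^ 31 = 32 for i32).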
10882  SDValue Ops[] = {
10883    Op,
10884    DAG.getConstant(NumBits+NumBits-1, OpVT),
10885    DAG.getConstant(X86::COND_E, MVT::i8),
10886    Op.getValue(1)
10887  };
10888  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
10889
10890  // Finally xor with NumBits-1.
10891  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
10892
10893  if (VT == MVT::i8)
10894    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
10895  return Op;
10896}
10897
10898static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
10899  EVT VT = Op.getValueType();
10900  EVT OpVT = VT;
10901  unsigned NumBits = VT.getSizeInBits();
10902  DebugLoc dl = Op.getDebugLoc();
10903
10904  Op = Op.getOperand(0);
10905  if (VT == MVT::i8) {
10906    // Zero extend to i32 since there is no i8 bsr.
10907    OpVT = MVT::i32;
10908    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
10909  }
10910
10911  // Issue a bsr (scan bits in reverse).
10912  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
10913  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
10914
10915  // And xor with NumBits-1.
10916  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
10917
10918  if (VT == MVT::i8)
10919    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
10920  return Op;
10921}
10922
10923static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
10924  EVT VT = Op.getValueType();
10925  unsigned NumBits = VT.getSizeInBits();
10926  DebugLoc dl = Op.getDebugLoc();
10927  Op = Op.getOperand(0);
10928
10929  // Issue a bsf (scan bits forward) which also sets EFLAGS.
10930  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
10931  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
10932
10933  // If src is zero (i.e. bsf sets ZF), returns NumBits.
10934  SDValue Ops[] = {
10935    Op,
10936    DAG.getConstant(NumBits, VT),
10937    DAG.getConstant(X86::COND_E, MVT::i8),
10938    Op.getValue(1)
10939  };
10940  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
10941}
10942
10943// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
10944// ones, and then concatenate the result back.
10945static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
10946  EVT VT = Op.getValueType();
10947
10948  assert(VT.is256BitVector() && VT.isInteger() &&
10949         "Unsupported value type for operation");
10950
10951  unsigned NumElems = VT.getVectorNumElements();
10952  DebugLoc dl = Op.getDebugLoc();
10953
10954  // Extract the LHS vectors
10955  SDValue LHS = Op.getOperand(0);
10956  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
10957  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
10958
10959  // Extract the RHS vectors
10960  SDValue RHS = Op.getOperand(1);
10961  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
10962  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
10963
10964  MVT EltVT = VT.getVectorElementType().getSimpleVT();
10965  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
10966
10967  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
10968                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
10969                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
10970}
10971
10972static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
10973  assert(Op.getValueType().is256BitVector() &&
10974         Op.getValueType().isInteger() &&
10975         "Only handle AVX 256-bit vector integer operation");
10976  return Lower256IntArith(Op, DAG);
10977}
10978
10979static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
10980  assert(Op.getValueType().is256BitVector() &&
10981         Op.getValueType().isInteger() &&
10982         "Only handle AVX 256-bit vector integer operation");
10983  return Lower256IntArith(Op, DAG);
10984}
10985
10986static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
10987                        SelectionDAG &DAG) {
10988  EVT VT = Op.getValueType();
10989
10990  // Decompose 256-bit ops into smaller 128-bit ops.
10991  if (VT.is256BitVector() && !Subtarget->hasInt256())
10992    return Lower256IntArith(Op, DAG);
10993
10994  assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
10995         "Only know how to lower V2I64/V4I64 multiply");
10996
10997  DebugLoc dl = Op.getDebugLoc();
10998
10999  //  Ahi = psrlqi(a, 32);
11000  //  Bhi = psrlqi(b, 32);
11001  //
11002  //  AloBlo = pmuludq(a, b);
11003  //  AloBhi = pmuludq(a, Bhi);
11004  //  AhiBlo = pmuludq(Ahi, b);
11005
11006  //  AloBhi = psllqi(AloBhi, 32);
11007  //  AhiBlo = psllqi(AhiBlo, 32);
11008  //  return AloBlo + AloBhi + AhiBlo;
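  //
  //  This is the schoolbook decomposition of the multiply into 32-bit halves:
  //  with A = Ahi*2^32 + Alo and B = Bhi*2^32 + Blo,
  //    A*B = Alo*Blo + (Alo*Bhi + Ahi*Blo)*2^32  (mod 2^64),
  //  since the Ahi*Bhi*2^64 term vanishes modulo 2^64.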
11009
11010  SDValue A = Op.getOperand(0);
11011  SDValue B = Op.getOperand(1);
11012
11013  SDValue ShAmt = DAG.getConstant(32, MVT::i32);
11014
11015  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
11016  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
11017
11018  // Bit cast to 32-bit vectors for MULUDQ
11019  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
11020  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
11021  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
11022  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
11023  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
11024
11025  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
11026  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
11027  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
11028
11029  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
11030  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
11031
11032  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
11033  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
11034}
11035
11036SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
11037
11038  EVT VT = Op.getValueType();
11039  DebugLoc dl = Op.getDebugLoc();
11040  SDValue R = Op.getOperand(0);
11041  SDValue Amt = Op.getOperand(1);
11042  LLVMContext *Context = DAG.getContext();
11043
11044  if (!Subtarget->hasSSE2())
11045    return SDValue();
11046
11047  // Optimize shl/srl/sra with constant shift amount.
11048  if (isSplatVector(Amt.getNode())) {
11049    SDValue SclrAmt = Amt->getOperand(0);
11050    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
11051      uint64_t ShiftAmt = C->getZExtValue();
11052
11053      if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
11054          (Subtarget->hasInt256() &&
11055           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
11056        if (Op.getOpcode() == ISD::SHL)
11057          return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
11058                             DAG.getConstant(ShiftAmt, MVT::i32));
11059        if (Op.getOpcode() == ISD::SRL)
11060          return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
11061                             DAG.getConstant(ShiftAmt, MVT::i32));
11062        if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
11063          return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
11064                             DAG.getConstant(ShiftAmt, MVT::i32));
11065      }
11066
11067      if (VT == MVT::v16i8) {
11068        if (Op.getOpcode() == ISD::SHL) {
11069          // Make a large shift.
11070          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
11071                                    DAG.getConstant(ShiftAmt, MVT::i32));
11072          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
11073          // Zero out the rightmost bits.
11074          SmallVector<SDValue, 16> V(16,
11075                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
11076                                                     MVT::i8));
11077          return DAG.getNode(ISD::AND, dl, VT, SHL,
11078                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
11079        }
11080        if (Op.getOpcode() == ISD::SRL) {
11081          // Make a large shift.
11082          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
11083                                    DAG.getConstant(ShiftAmt, MVT::i32));
11084          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
11085          // Zero out the leftmost bits.
11086          SmallVector<SDValue, 16> V(16,
11087                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
11088                                                     MVT::i8));
11089          return DAG.getNode(ISD::AND, dl, VT, SRL,
11090                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
11091        }
11092        if (Op.getOpcode() == ISD::SRA) {
11093          if (ShiftAmt == 7) {
11094            // R s>> 7  ===  R s< 0
11095            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
11096            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
11097          }
11098
11099          // R s>> a === ((R u>> a) ^ m) - m
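          // where m = 0x80 >> a is the shifted-down sign bit: the XOR flips that
          // bit in the logical-shift result and the SUB then sign-extends it.
          // E.g. for a = 3: 0xF0 u>> 3 = 0x1E, 0x1E ^ 0x10 = 0x0E, and
          // 0x0E - 0x10 = 0xFE, which is 0xF0 s>> 3.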
11100          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
11101          SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
11102                                                         MVT::i8));
11103          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
11104          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
11105          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
11106          return Res;
11107        }
11108        llvm_unreachable("Unknown shift opcode.");
11109      }
11110
11111      if (Subtarget->hasInt256() && VT == MVT::v32i8) {
11112        if (Op.getOpcode() == ISD::SHL) {
11113          // Make a large shift.
11114          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
11115                                    DAG.getConstant(ShiftAmt, MVT::i32));
11116          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
11117          // Zero out the rightmost bits.
11118          SmallVector<SDValue, 32> V(32,
11119                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
11120                                                     MVT::i8));
11121          return DAG.getNode(ISD::AND, dl, VT, SHL,
11122                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
11123        }
11124        if (Op.getOpcode() == ISD::SRL) {
11125          // Make a large shift.
11126          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
11127                                    DAG.getConstant(ShiftAmt, MVT::i32));
11128          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
11129          // Zero out the leftmost bits.
11130          SmallVector<SDValue, 32> V(32,
11131                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
11132                                                     MVT::i8));
11133          return DAG.getNode(ISD::AND, dl, VT, SRL,
11134                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
11135        }
11136        if (Op.getOpcode() == ISD::SRA) {
11137          if (ShiftAmt == 7) {
11138            // R s>> 7  ===  R s< 0
11139            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
11140            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
11141          }
11142
11143          // R s>> a === ((R u>> a) ^ m) - m
11144          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
11145          SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
11146                                                         MVT::i8));
11147          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
11148          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
11149          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
11150          return Res;
11151        }
11152        llvm_unreachable("Unknown shift opcode.");
11153      }
11154    }
11155  }
11156
11157  // Lower SHL with variable shift amount.
11158  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
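    // Shift each 32-bit amount into the float exponent field (bit 23) and add
    // the encoding of 1.0f (0x3f800000); bitcast to float and converted back to
    // an integer, each lane becomes 2^amt, so the variable shift is done as a
    // vector multiply.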
11159    Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1),
11160                     DAG.getConstant(23, MVT::i32));
11161
11162    const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U};
11163    Constant *C = ConstantDataVector::get(*Context, CV);
11164    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
11165    SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
11166                                 MachinePointerInfo::getConstantPool(),
11167                                 false, false, false, 16);
11168
11169    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
11170    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
11171    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
11172    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
11173  }
11174  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
11175    assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
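    // A v16i8 shift amount has at most 3 significant bits.  Move them up to
    // bit 7 ('a = a << 5') and peel them off one at a time with PCMPEQ against
    // 0x80, using each mask to VSELECT between R and R shifted by 4, then 2,
    // then 1.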
11176
11177    // a = a << 5;
11178    Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1),
11179                     DAG.getConstant(5, MVT::i32));
11180    Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
11181
11182    // Turn 'a' into a mask suitable for VSELECT
11183    SDValue VSelM = DAG.getConstant(0x80, VT);
11184    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
11185    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
11186
11187    SDValue CM1 = DAG.getConstant(0x0f, VT);
11188    SDValue CM2 = DAG.getConstant(0x3f, VT);
11189
11190    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
11191    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
11192    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
11193                            DAG.getConstant(4, MVT::i32), DAG);
11194    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
11195    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
11196
11197    // a += a
11198    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
11199    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
11200    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
11201
11202    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
11203    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
11204    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
11205                            DAG.getConstant(2, MVT::i32), DAG);
11206    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
11207    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
11208
11209    // a += a
11210    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
11211    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
11212    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
11213
11214    // return VSELECT(r, r+r, a);
11215    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
11216                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
11217    return R;
11218  }
11219
11220  // Decompose 256-bit shifts into smaller 128-bit shifts.
11221  if (VT.is256BitVector()) {
11222    unsigned NumElems = VT.getVectorNumElements();
11223    MVT EltVT = VT.getVectorElementType().getSimpleVT();
11224    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
11225
11226    // Extract the two vectors
11227    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
11228    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
11229
11230    // Recreate the shift amount vectors
11231    SDValue Amt1, Amt2;
11232    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
11233      // Constant shift amount
11234      SmallVector<SDValue, 4> Amt1Csts;
11235      SmallVector<SDValue, 4> Amt2Csts;
11236      for (unsigned i = 0; i != NumElems/2; ++i)
11237        Amt1Csts.push_back(Amt->getOperand(i));
11238      for (unsigned i = NumElems/2; i != NumElems; ++i)
11239        Amt2Csts.push_back(Amt->getOperand(i));
11240
11241      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
11242                                 &Amt1Csts[0], NumElems/2);
11243      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
11244                                 &Amt2Csts[0], NumElems/2);
11245    } else {
11246      // Variable shift amount
11247      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
11248      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
11249    }
11250
11251    // Issue new vector shifts for the smaller types
11252    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
11253    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
11254
11255    // Concatenate the result back
11256    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
11257  }
11258
11259  return SDValue();
11260}
11261
11262static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
11263  // Lower the "add/sub/mul with overflow" instruction into a regular arithmetic
11264  // instruction plus a "setcc" instruction that checks the overflow flag. The
11265  // "brcond" lowering looks for this combo and may remove the "setcc"
11266  // instruction if the "setcc" has only one use.
11267  SDNode *N = Op.getNode();
11268  SDValue LHS = N->getOperand(0);
11269  SDValue RHS = N->getOperand(1);
11270  unsigned BaseOp = 0;
11271  unsigned Cond = 0;
11272  DebugLoc DL = Op.getDebugLoc();
11273  switch (Op.getOpcode()) {
11274  default: llvm_unreachable("Unknown ovf instruction!");
11275  case ISD::SADDO:
11276    // An add of one will be selected as an INC. Note that INC doesn't
11277    // set CF, so we can't do this for UADDO.
11278    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
11279      if (C->isOne()) {
11280        BaseOp = X86ISD::INC;
11281        Cond = X86::COND_O;
11282        break;
11283      }
11284    BaseOp = X86ISD::ADD;
11285    Cond = X86::COND_O;
11286    break;
11287  case ISD::UADDO:
11288    BaseOp = X86ISD::ADD;
11289    Cond = X86::COND_B;
11290    break;
11291  case ISD::SSUBO:
11292    // A subtract of one will be selected as a DEC. Note that DEC doesn't
11293    // set CF, so we can't do this for USUBO.
11294    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
11295      if (C->isOne()) {
11296        BaseOp = X86ISD::DEC;
11297        Cond = X86::COND_O;
11298        break;
11299      }
11300    BaseOp = X86ISD::SUB;
11301    Cond = X86::COND_O;
11302    break;
11303  case ISD::USUBO:
11304    BaseOp = X86ISD::SUB;
11305    Cond = X86::COND_B;
11306    break;
11307  case ISD::SMULO:
11308    BaseOp = X86ISD::SMUL;
11309    Cond = X86::COND_O;
11310    break;
11311  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
11312    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
11313                                 MVT::i32);
11314    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
11315
11316    SDValue SetCC =
11317      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
11318                  DAG.getConstant(X86::COND_O, MVT::i32),
11319                  SDValue(Sum.getNode(), 2));
11320
11321    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
11322  }
11323  }
11324
11325  // Also sets EFLAGS.
11326  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
11327  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
11328
11329  SDValue SetCC =
11330    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
11331                DAG.getConstant(Cond, MVT::i32),
11332                SDValue(Sum.getNode(), 1));
11333
11334  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
11335}
11336
11337SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
11338                                                  SelectionDAG &DAG) const {
11339  DebugLoc dl = Op.getDebugLoc();
11340  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
11341  EVT VT = Op.getValueType();
11342
11343  if (!Subtarget->hasSSE2() || !VT.isVector())
11344    return SDValue();
11345
11346  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
11347                      ExtraVT.getScalarType().getSizeInBits();
11348  SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
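  // sign_extend_inreg from ExtraVT is lowered as a vector shift left by the
  // number of dropped bits (BitsDiff) followed by an arithmetic shift right by
  // the same amount.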
11349
11350  switch (VT.getSimpleVT().SimpleTy) {
11351    default: return SDValue();
11352    case MVT::v8i32:
11353    case MVT::v16i16:
11354      if (!Subtarget->hasFp256())
11355        return SDValue();
11356      if (!Subtarget->hasInt256()) {
11357        // needs to be split
11358        unsigned NumElems = VT.getVectorNumElements();
11359
11360        // Extract the LHS vectors
11361        SDValue LHS = Op.getOperand(0);
11362        SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
11363        SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
11364
11365        MVT EltVT = VT.getVectorElementType().getSimpleVT();
11366        EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
11367
11368        EVT ExtraEltVT = ExtraVT.getVectorElementType();
11369        unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
11370        ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
11371                                   ExtraNumElems/2);
11372        SDValue Extra = DAG.getValueType(ExtraVT);
11373
11374        LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
11375        LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
11376
11377        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
11378      }
11379      // fall through
11380    case MVT::v4i32:
11381    case MVT::v8i16: {
11382      SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT,
11383                                         Op.getOperand(0), ShAmt, DAG);
11384      return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
11385    }
11386  }
11387}
11388
11389
11390static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget,
11391                              SelectionDAG &DAG) {
11392  DebugLoc dl = Op.getDebugLoc();
11393
11394  // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
11395  // There isn't any reason to disable it if the target processor supports it.
11396  if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
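    // Without SSE2 there is no mfence, so emit a locked OR of zero into a stack
    // slot ('lock orl %reg, (%esp)' with a zero register), which has the same
    // full-barrier effect.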
11397    SDValue Chain = Op.getOperand(0);
11398    SDValue Zero = DAG.getConstant(0, MVT::i32);
11399    SDValue Ops[] = {
11400      DAG.getRegister(X86::ESP, MVT::i32), // Base
11401      DAG.getTargetConstant(1, MVT::i8),   // Scale
11402      DAG.getRegister(0, MVT::i32),        // Index
11403      DAG.getTargetConstant(0, MVT::i32),  // Disp
11404      DAG.getRegister(0, MVT::i32),        // Segment.
11405      Zero,
11406      Chain
11407    };
11408    SDNode *Res =
11409      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
11410                          array_lengthof(Ops));
11411    return SDValue(Res, 0);
11412  }
11413
11414  unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
11415  if (!isDev)
11416    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
11417
11418  unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
11419  unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
11420  unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
11421  unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
11422
11423  // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
11424  if (!Op1 && !Op2 && !Op3 && Op4)
11425    return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));
11426
11427  // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
11428  if (Op1 && !Op2 && !Op3 && !Op4)
11429    return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));
11430
11431  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
11432  //           (MFENCE)>;
11433  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
11434}
11435
11436static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
11437                                 SelectionDAG &DAG) {
11438  DebugLoc dl = Op.getDebugLoc();
11439  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
11440    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
11441  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
11442    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
11443
11444  // The only fence that needs an instruction is a sequentially-consistent
11445  // cross-thread fence.
11446  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
11447    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
11448    // no-sse2). There isn't any reason to disable it if the target processor
11449    // supports it.
11450    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
11451      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
11452
11453    SDValue Chain = Op.getOperand(0);
11454    SDValue Zero = DAG.getConstant(0, MVT::i32);
11455    SDValue Ops[] = {
11456      DAG.getRegister(X86::ESP, MVT::i32), // Base
11457      DAG.getTargetConstant(1, MVT::i8),   // Scale
11458      DAG.getRegister(0, MVT::i32),        // Index
11459      DAG.getTargetConstant(0, MVT::i32),  // Disp
11460      DAG.getRegister(0, MVT::i32),        // Segment.
11461      Zero,
11462      Chain
11463    };
11464    SDNode *Res =
11465      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
11466                         array_lengthof(Ops));
11467    return SDValue(Res, 0);
11468  }
11469
11470  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
11471  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
11472}
11473
11474
11475static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
11476                             SelectionDAG &DAG) {
11477  EVT T = Op.getValueType();
11478  DebugLoc DL = Op.getDebugLoc();
11479  unsigned Reg = 0;
11480  unsigned size = 0;
11481  switch(T.getSimpleVT().SimpleTy) {
11482  default: llvm_unreachable("Invalid value type!");
11483  case MVT::i8:  Reg = X86::AL;  size = 1; break;
11484  case MVT::i16: Reg = X86::AX;  size = 2; break;
11485  case MVT::i32: Reg = X86::EAX; size = 4; break;
11486  case MVT::i64:
11487    assert(Subtarget->is64Bit() && "Node not type legal!");
11488    Reg = X86::RAX; size = 8;
11489    break;
11490  }
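  // CMPXCHG implicitly uses the accumulator: the expected value is placed in
  // AL/AX/EAX/RAX and the previous memory value is returned in the same register.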
11491  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
11492                                    Op.getOperand(2), SDValue());
11493  SDValue Ops[] = { cpIn.getValue(0),
11494                    Op.getOperand(1),
11495                    Op.getOperand(3),
11496                    DAG.getTargetConstant(size, MVT::i8),
11497                    cpIn.getValue(1) };
11498  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11499  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
11500  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
11501                                           Ops, 5, T, MMO);
11502  SDValue cpOut =
11503    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
11504  return cpOut;
11505}
11506
11507static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
11508                                     SelectionDAG &DAG) {
11509  assert(Subtarget->is64Bit() && "Result not type legalized?");
11510  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11511  SDValue TheChain = Op.getOperand(0);
11512  DebugLoc dl = Op.getDebugLoc();
11513  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
11514  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
11515  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
11516                                   rax.getValue(2));
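  // RDTSC returns the low half of the counter in RAX and the high half in RDX;
  // recombine them as (RDX << 32) | RAX.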
11517  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
11518                            DAG.getConstant(32, MVT::i8));
11519  SDValue Ops[] = {
11520    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
11521    rdx.getValue(1)
11522  };
11523  return DAG.getMergeValues(Ops, 2, dl);
11524}
11525
11526SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
11527  EVT SrcVT = Op.getOperand(0).getValueType();
11528  EVT DstVT = Op.getValueType();
11529  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
11530         Subtarget->hasMMX() && "Unexpected custom BITCAST");
11531  assert((DstVT == MVT::i64 ||
11532          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
11533         "Unexpected custom BITCAST");
11534  // i64 <=> MMX conversions are Legal.
11535  if (SrcVT==MVT::i64 && DstVT.isVector())
11536    return Op;
11537  if (DstVT==MVT::i64 && SrcVT.isVector())
11538    return Op;
11539  // MMX <=> MMX conversions are Legal.
11540  if (SrcVT.isVector() && DstVT.isVector())
11541    return Op;
11542  // All other conversions need to be expanded.
11543  return SDValue();
11544}
11545
11546static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
11547  SDNode *Node = Op.getNode();
11548  DebugLoc dl = Node->getDebugLoc();
11549  EVT T = Node->getValueType(0);
11550  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
11551                              DAG.getConstant(0, T), Node->getOperand(2));
11552  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
11553                       cast<AtomicSDNode>(Node)->getMemoryVT(),
11554                       Node->getOperand(0),
11555                       Node->getOperand(1), negOp,
11556                       cast<AtomicSDNode>(Node)->getSrcValue(),
11557                       cast<AtomicSDNode>(Node)->getAlignment(),
11558                       cast<AtomicSDNode>(Node)->getOrdering(),
11559                       cast<AtomicSDNode>(Node)->getSynchScope());
11560}
11561
11562static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
11563  SDNode *Node = Op.getNode();
11564  DebugLoc dl = Node->getDebugLoc();
11565  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
11566
11567  // Convert seq_cst store -> xchg
11568  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
11569  // FIXME: On 32-bit, store -> fist or movq would be more efficient
11570  //        (The only way to get a 16-byte store is cmpxchg16b)
11571  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
11572  if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
11573      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
11574    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
11575                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
11576                                 Node->getOperand(0),
11577                                 Node->getOperand(1), Node->getOperand(2),
11578                                 cast<AtomicSDNode>(Node)->getMemOperand(),
11579                                 cast<AtomicSDNode>(Node)->getOrdering(),
11580                                 cast<AtomicSDNode>(Node)->getSynchScope());
11581    return Swap.getValue(1);
11582  }
11583  // Other atomic stores have a simple pattern.
11584  return Op;
11585}
11586
11587static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
11588  EVT VT = Op.getNode()->getValueType(0);
11589
11590  // Let legalize expand this if it isn't a legal type yet.
11591  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
11592    return SDValue();
11593
11594  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
11595
11596  unsigned Opc;
11597  bool ExtraOp = false;
11598  switch (Op.getOpcode()) {
11599  default: llvm_unreachable("Invalid code");
11600  case ISD::ADDC: Opc = X86ISD::ADD; break;
11601  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
11602  case ISD::SUBC: Opc = X86ISD::SUB; break;
11603  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
11604  }
11605
11606  if (!ExtraOp)
11607    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
11608                       Op.getOperand(1));
11609  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
11610                     Op.getOperand(1), Op.getOperand(2));
11611}
11612
11613/// LowerOperation - Provide custom lowering hooks for some operations.
11614///
11615SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
11616  switch (Op.getOpcode()) {
11617  default: llvm_unreachable("Should not custom lower this!");
11618  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
11619  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op, Subtarget, DAG);
11620  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
11621  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op, Subtarget, DAG);
11622  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
11623  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
11624  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
11625  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
11626  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
11627  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
11628  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
11629  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
11630  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
11631  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
11632  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
11633  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
11634  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
11635  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
11636  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
11637  case ISD::SHL_PARTS:
11638  case ISD::SRA_PARTS:
11639  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
11640  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
11641  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
11642  case ISD::TRUNCATE:           return lowerTRUNCATE(Op, DAG);
11643  case ISD::ZERO_EXTEND:        return lowerZERO_EXTEND(Op, DAG);
11644  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
11645  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
11646  case ISD::FP_EXTEND:          return lowerFP_EXTEND(Op, DAG);
11647  case ISD::FABS:               return LowerFABS(Op, DAG);
11648  case ISD::FNEG:               return LowerFNEG(Op, DAG);
11649  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
11650  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
11651  case ISD::SETCC:              return LowerSETCC(Op, DAG);
11652  case ISD::SELECT:             return LowerSELECT(Op, DAG);
11653  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
11654  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
11655  case ISD::VASTART:            return LowerVASTART(Op, DAG);
11656  case ISD::VAARG:              return LowerVAARG(Op, DAG);
11657  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
11658  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
11659  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
11660  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
11661  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
11662  case ISD::FRAME_TO_ARGS_OFFSET:
11663                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
11664  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
11665  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
11666  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
11667  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
11668  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
11669  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
11670  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
11671  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
11672  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
11673  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
11674  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
11675  case ISD::SRA:
11676  case ISD::SRL:
11677  case ISD::SHL:                return LowerShift(Op, DAG);
11678  case ISD::SADDO:
11679  case ISD::UADDO:
11680  case ISD::SSUBO:
11681  case ISD::USUBO:
11682  case ISD::SMULO:
11683  case ISD::UMULO:              return LowerXALUO(Op, DAG);
11684  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
11685  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
11686  case ISD::ADDC:
11687  case ISD::ADDE:
11688  case ISD::SUBC:
11689  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
11690  case ISD::ADD:                return LowerADD(Op, DAG);
11691  case ISD::SUB:                return LowerSUB(Op, DAG);
11692  }
11693}
11694
11695static void ReplaceATOMIC_LOAD(SDNode *Node,
11696                                  SmallVectorImpl<SDValue> &Results,
11697                                  SelectionDAG &DAG) {
11698  DebugLoc dl = Node->getDebugLoc();
11699  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
11700
11701  // Convert wide load -> cmpxchg8b/cmpxchg16b
11702  // FIXME: On 32-bit, load -> fild or movq would be more efficient
11703  //        (The only way to get a 16-byte load is cmpxchg16b)
11704  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
11705  SDValue Zero = DAG.getConstant(0, VT);
11706  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
11707                               Node->getOperand(0),
11708                               Node->getOperand(1), Zero, Zero,
11709                               cast<AtomicSDNode>(Node)->getMemOperand(),
11710                               cast<AtomicSDNode>(Node)->getOrdering(),
11711                               cast<AtomicSDNode>(Node)->getSynchScope());
11712  Results.push_back(Swap.getValue(0));
11713  Results.push_back(Swap.getValue(1));
11714}
11715
11716static void
11717ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
11718                        SelectionDAG &DAG, unsigned NewOp) {
11719  DebugLoc dl = Node->getDebugLoc();
11720  assert (Node->getValueType(0) == MVT::i64 &&
11721          "Only know how to expand i64 atomics");
11722
11723  SDValue Chain = Node->getOperand(0);
11724  SDValue In1 = Node->getOperand(1);
11725  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
11726                             Node->getOperand(2), DAG.getIntPtrConstant(0));
11727  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
11728                             Node->getOperand(2), DAG.getIntPtrConstant(1));
11729  SDValue Ops[] = { Chain, In1, In2L, In2H };
11730  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
11731  SDValue Result =
11732    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
11733                            cast<MemSDNode>(Node)->getMemOperand());
11734  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
11735  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
11736  Results.push_back(Result.getValue(2));
11737}
11738
11739/// ReplaceNodeResults - Replace a node with an illegal result type
11740/// with a new node built out of custom code.
11741void X86TargetLowering::ReplaceNodeResults(SDNode *N,
11742                                           SmallVectorImpl<SDValue>&Results,
11743                                           SelectionDAG &DAG) const {
11744  DebugLoc dl = N->getDebugLoc();
11745  switch (N->getOpcode()) {
11746  default:
11747    llvm_unreachable("Do not know how to custom type legalize this operation!");
11748  case ISD::SIGN_EXTEND_INREG:
11749  case ISD::ADDC:
11750  case ISD::ADDE:
11751  case ISD::SUBC:
11752  case ISD::SUBE:
11753    // We don't want to expand or promote these.
11754    return;
11755  case ISD::FP_TO_SINT:
11756  case ISD::FP_TO_UINT: {
11757    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
11758
11759    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
11760      return;
11761
11762    std::pair<SDValue,SDValue> Vals =
11763        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
11764    SDValue FIST = Vals.first, StackSlot = Vals.second;
11765    if (FIST.getNode() != 0) {
11766      EVT VT = N->getValueType(0);
11767      // Return a load from the stack slot.
11768      if (StackSlot.getNode() != 0)
11769        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
11770                                      MachinePointerInfo(),
11771                                      false, false, false, 0));
11772      else
11773        Results.push_back(FIST);
11774    }
11775    return;
11776  }
11777  case ISD::UINT_TO_FP: {
11778    if (N->getOperand(0).getValueType() != MVT::v2i32 &&
11779        N->getValueType(0) != MVT::v2f32)
11780      return;
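    // Widen v2i32 to v2i64 and use the 2^52 trick: 0x4330000000000000 is the
    // double 2^52, so OR-ing each zero-extended 32-bit value into its mantissa
    // and then subtracting 2^52 produces the value exactly as a double, which
    // VFPROUND then narrows to v4f32.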
11781    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
11782                                 N->getOperand(0));
11783    SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
11784                                     MVT::f64);
11785    SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
11786    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
11787                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
11788    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
11789    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
11790    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
11791    return;
11792  }
11793  case ISD::FP_ROUND: {
11794    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
11795    Results.push_back(V);
11796    return;
11797  }
11798  case ISD::READCYCLECOUNTER: {
11799    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11800    SDValue TheChain = N->getOperand(0);
11801    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
11802    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
11803                                     rd.getValue(1));
11804    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
11805                                     eax.getValue(2));
11806    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
11807    SDValue Ops[] = { eax, edx };
11808    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
11809    Results.push_back(edx.getValue(1));
11810    return;
11811  }
11812  case ISD::ATOMIC_CMP_SWAP: {
11813    EVT T = N->getValueType(0);
11814    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
11815    bool Regs64bit = T == MVT::i128;
11816    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
11817    SDValue cpInL, cpInH;
11818    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
11819                        DAG.getConstant(0, HalfT));
11820    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
11821                        DAG.getConstant(1, HalfT));
11822    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
11823                             Regs64bit ? X86::RAX : X86::EAX,
11824                             cpInL, SDValue());
11825    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
11826                             Regs64bit ? X86::RDX : X86::EDX,
11827                             cpInH, cpInL.getValue(1));
11828    SDValue swapInL, swapInH;
11829    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
11830                          DAG.getConstant(0, HalfT));
11831    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
11832                          DAG.getConstant(1, HalfT));
11833    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
11834                               Regs64bit ? X86::RBX : X86::EBX,
11835                               swapInL, cpInH.getValue(1));
11836    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
11837                               Regs64bit ? X86::RCX : X86::ECX,
11838                               swapInH, swapInL.getValue(1));
11839    SDValue Ops[] = { swapInH.getValue(0),
11840                      N->getOperand(1),
11841                      swapInH.getValue(1) };
11842    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11843    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
11844    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
11845                                  X86ISD::LCMPXCHG8_DAG;
11846    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
11847                                             Ops, 3, T, MMO);
11848    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
11849                                        Regs64bit ? X86::RAX : X86::EAX,
11850                                        HalfT, Result.getValue(1));
11851    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
11852                                        Regs64bit ? X86::RDX : X86::EDX,
11853                                        HalfT, cpOutL.getValue(2));
11854    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
11855    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
11856    Results.push_back(cpOutH.getValue(1));
11857    return;
11858  }
11859  case ISD::ATOMIC_LOAD_ADD:
11860  case ISD::ATOMIC_LOAD_AND:
11861  case ISD::ATOMIC_LOAD_NAND:
11862  case ISD::ATOMIC_LOAD_OR:
11863  case ISD::ATOMIC_LOAD_SUB:
11864  case ISD::ATOMIC_LOAD_XOR:
11865  case ISD::ATOMIC_LOAD_MAX:
11866  case ISD::ATOMIC_LOAD_MIN:
11867  case ISD::ATOMIC_LOAD_UMAX:
11868  case ISD::ATOMIC_LOAD_UMIN:
11869  case ISD::ATOMIC_SWAP: {
11870    unsigned Opc;
11871    switch (N->getOpcode()) {
11872    default: llvm_unreachable("Unexpected opcode");
11873    case ISD::ATOMIC_LOAD_ADD:
11874      Opc = X86ISD::ATOMADD64_DAG;
11875      break;
11876    case ISD::ATOMIC_LOAD_AND:
11877      Opc = X86ISD::ATOMAND64_DAG;
11878      break;
11879    case ISD::ATOMIC_LOAD_NAND:
11880      Opc = X86ISD::ATOMNAND64_DAG;
11881      break;
11882    case ISD::ATOMIC_LOAD_OR:
11883      Opc = X86ISD::ATOMOR64_DAG;
11884      break;
11885    case ISD::ATOMIC_LOAD_SUB:
11886      Opc = X86ISD::ATOMSUB64_DAG;
11887      break;
11888    case ISD::ATOMIC_LOAD_XOR:
11889      Opc = X86ISD::ATOMXOR64_DAG;
11890      break;
11891    case ISD::ATOMIC_LOAD_MAX:
11892      Opc = X86ISD::ATOMMAX64_DAG;
11893      break;
11894    case ISD::ATOMIC_LOAD_MIN:
11895      Opc = X86ISD::ATOMMIN64_DAG;
11896      break;
11897    case ISD::ATOMIC_LOAD_UMAX:
11898      Opc = X86ISD::ATOMUMAX64_DAG;
11899      break;
11900    case ISD::ATOMIC_LOAD_UMIN:
11901      Opc = X86ISD::ATOMUMIN64_DAG;
11902      break;
11903    case ISD::ATOMIC_SWAP:
11904      Opc = X86ISD::ATOMSWAP64_DAG;
11905      break;
11906    }
11907    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
11908    return;
11909  }
11910  case ISD::ATOMIC_LOAD:
11911    ReplaceATOMIC_LOAD(N, Results, DAG);
11912  }
11913}
11914
11915const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
11916  switch (Opcode) {
11917  default: return NULL;
11918  case X86ISD::BSF:                return "X86ISD::BSF";
11919  case X86ISD::BSR:                return "X86ISD::BSR";
11920  case X86ISD::SHLD:               return "X86ISD::SHLD";
11921  case X86ISD::SHRD:               return "X86ISD::SHRD";
11922  case X86ISD::FAND:               return "X86ISD::FAND";
11923  case X86ISD::FOR:                return "X86ISD::FOR";
11924  case X86ISD::FXOR:               return "X86ISD::FXOR";
11925  case X86ISD::FSRL:               return "X86ISD::FSRL";
11926  case X86ISD::FILD:               return "X86ISD::FILD";
11927  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
11928  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
11929  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
11930  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
11931  case X86ISD::FLD:                return "X86ISD::FLD";
11932  case X86ISD::FST:                return "X86ISD::FST";
11933  case X86ISD::CALL:               return "X86ISD::CALL";
11934  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
11935  case X86ISD::BT:                 return "X86ISD::BT";
11936  case X86ISD::CMP:                return "X86ISD::CMP";
11937  case X86ISD::COMI:               return "X86ISD::COMI";
11938  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
11939  case X86ISD::SETCC:              return "X86ISD::SETCC";
11940  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
11941  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
11942  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
11943  case X86ISD::CMOV:               return "X86ISD::CMOV";
11944  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
11945  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
11946  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
11947  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
11948  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
11949  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
11950  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
11951  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
11952  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
11953  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
11954  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
11955  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
11956  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
11957  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
11958  case X86ISD::PSIGN:              return "X86ISD::PSIGN";
11959  case X86ISD::BLENDV:             return "X86ISD::BLENDV";
11960  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
11961  case X86ISD::HADD:               return "X86ISD::HADD";
11962  case X86ISD::HSUB:               return "X86ISD::HSUB";
11963  case X86ISD::FHADD:              return "X86ISD::FHADD";
11964  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
11965  case X86ISD::FMAX:               return "X86ISD::FMAX";
11966  case X86ISD::FMIN:               return "X86ISD::FMIN";
11967  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
11968  case X86ISD::FMINC:              return "X86ISD::FMINC";
11969  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
11970  case X86ISD::FRCP:               return "X86ISD::FRCP";
11971  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
11972  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
11973  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
11974  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
11975  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
11976  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
11977  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
11978  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
11979  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
11980  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
11981  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
11982  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
11983  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
11984  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
11985  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
11986  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
11987  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
11988  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
11989  case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
11990  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
11991  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
11992  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
11993  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
11994  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
11995  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
11996  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
11997  case X86ISD::VSHL:               return "X86ISD::VSHL";
11998  case X86ISD::VSRL:               return "X86ISD::VSRL";
11999  case X86ISD::VSRA:               return "X86ISD::VSRA";
12000  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
12001  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
12002  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
12003  case X86ISD::CMPP:               return "X86ISD::CMPP";
12004  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
12005  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
12006  case X86ISD::ADD:                return "X86ISD::ADD";
12007  case X86ISD::SUB:                return "X86ISD::SUB";
12008  case X86ISD::ADC:                return "X86ISD::ADC";
12009  case X86ISD::SBB:                return "X86ISD::SBB";
12010  case X86ISD::SMUL:               return "X86ISD::SMUL";
12011  case X86ISD::UMUL:               return "X86ISD::UMUL";
12012  case X86ISD::INC:                return "X86ISD::INC";
12013  case X86ISD::DEC:                return "X86ISD::DEC";
12014  case X86ISD::OR:                 return "X86ISD::OR";
12015  case X86ISD::XOR:                return "X86ISD::XOR";
12016  case X86ISD::AND:                return "X86ISD::AND";
12017  case X86ISD::ANDN:               return "X86ISD::ANDN";
12018  case X86ISD::BLSI:               return "X86ISD::BLSI";
12019  case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
12020  case X86ISD::BLSR:               return "X86ISD::BLSR";
12021  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
12022  case X86ISD::PTEST:              return "X86ISD::PTEST";
12023  case X86ISD::TESTP:              return "X86ISD::TESTP";
12024  case X86ISD::PALIGN:             return "X86ISD::PALIGN";
12025  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
12026  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
12027  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
12028  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
12029  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
12030  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
12031  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
12032  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
12033  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
12034  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
12035  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
12036  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
12037  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
12038  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
12039  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
12040  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
12041  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
12042  case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
12043  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
12044  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
12045  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
12046  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
12047  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
12048  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
12049  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
12050  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
12051  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
12052  case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
12053  case X86ISD::SAHF:               return "X86ISD::SAHF";
12054  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
12055  case X86ISD::FMADD:              return "X86ISD::FMADD";
12056  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
12057  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
12058  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
12059  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
12060  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
12061  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
12062  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
12063  }
12064}
12065
12066// isLegalAddressingMode - Return true if the addressing mode represented
12067// by AM is legal for this target, for a load/store of the specified type.
12068bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
12069                                              Type *Ty) const {
12070  // X86 supports extremely general addressing modes.
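  // The general form is BaseGV + BaseReg + ScaleReg*Scale + Disp, where Scale
  // is 1, 2, 4 or 8 and Disp is a sign-extended 32-bit immediate.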
12071  CodeModel::Model M = getTargetMachine().getCodeModel();
12072  Reloc::Model R = getTargetMachine().getRelocationModel();
12073
12074  // X86 allows a sign-extended 32-bit immediate field as a displacement.
12075  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
12076    return false;
12077
12078  if (AM.BaseGV) {
12079    unsigned GVFlags =
12080      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
12081
12082    // If a reference to this global requires an extra load, we can't fold it.
12083    if (isGlobalStubReference(GVFlags))
12084      return false;
12085
12086    // If BaseGV requires a register for the PIC base, we cannot also have a
12087    // BaseReg specified.
12088    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
12089      return false;
12090
12091    // If lower 4G is not available, then we must use rip-relative addressing.
12092    if ((M != CodeModel::Small || R != Reloc::Static) &&
12093        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
12094      return false;
12095  }
12096
12097  switch (AM.Scale) {
12098  case 0:
12099  case 1:
12100  case 2:
12101  case 4:
12102  case 8:
12103    // These scales always work.
12104    break;
12105  case 3:
12106  case 5:
12107  case 9:
12108    // These scales are formed with basereg+scalereg.  Only accept if there is
12109    // no basereg yet.
12110    if (AM.HasBaseReg)
12111      return false;
12112    break;
12113  default:  // No other scale value is supported.
12114    return false;
12115  }
12116
12117  return true;
12118}
12119
12120
12121bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
12122  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
12123    return false;
12124  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
12125  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
12126  if (NumBits1 <= NumBits2)
12127    return false;
12128  return true;
12129}
12130
12131bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
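  // CMP against a register accepts at most a sign-extended 32-bit immediate,
  // so the immediate is legal iff it survives a round-trip through int32_t.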
12132  return Imm == (int32_t)Imm;
12133}
12134
12135bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
12136  // Can also use sub to handle negated immediates.
12137  return Imm == (int32_t)Imm;
12138}
12139
12140bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
12141  if (!VT1.isInteger() || !VT2.isInteger())
12142    return false;
12143  unsigned NumBits1 = VT1.getSizeInBits();
12144  unsigned NumBits2 = VT2.getSizeInBits();
12145  if (NumBits1 <= NumBits2)
12146    return false;
12147  return true;
12148}
12149
12150bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
12151  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
12152  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
12153}
12154
12155bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
12156  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
12157  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
12158}
12159
12160bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
12161  EVT VT1 = Val.getValueType();
12162  if (isZExtFree(VT1, VT2))
12163    return true;
12164
12165  if (Val.getOpcode() != ISD::LOAD)
12166    return false;
12167
12168  if (!VT1.isSimple() || !VT1.isInteger() ||
12169      !VT2.isSimple() || !VT2.isInteger())
12170    return false;
12171
12172  switch (VT1.getSimpleVT().SimpleTy) {
12173  default: break;
12174  case MVT::i8:
12175  case MVT::i16:
12176  case MVT::i32:
12177    // X86 has 8, 16, and 32-bit zero-extending loads.
12178    return true;
12179  }
12180
12181  return false;
12182}
12183
12184bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
12185  // i16 instructions are longer (0x66 prefix) and potentially slower.
12186  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
12187}
12188
12189/// isShuffleMaskLegal - Targets can use this to indicate that they only
12190/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
12191/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
12192/// are assumed to be legal.
12193bool
12194X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
12195                                      EVT VT) const {
12196  // Very little shuffling can be done for 64-bit vectors right now.
12197  if (VT.getSizeInBits() == 64)
12198    return false;
12199
12200  // FIXME: pshufb, blends, shifts.
12201  return (VT.getVectorNumElements() == 2 ||
12202          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
12203          isMOVLMask(M, VT) ||
12204          isSHUFPMask(M, VT, Subtarget->hasFp256()) ||
12205          isPSHUFDMask(M, VT) ||
12206          isPSHUFHWMask(M, VT, Subtarget->hasInt256()) ||
12207          isPSHUFLWMask(M, VT, Subtarget->hasInt256()) ||
12208          isPALIGNRMask(M, VT, Subtarget) ||
12209          isUNPCKLMask(M, VT, Subtarget->hasInt256()) ||
12210          isUNPCKHMask(M, VT, Subtarget->hasInt256()) ||
12211          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) ||
12212          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256()));
12213}
12214
12215bool
12216X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
12217                                          EVT VT) const {
12218  unsigned NumElts = VT.getVectorNumElements();
12219  // FIXME: This collection of masks seems suspect.
12220  if (NumElts == 2)
12221    return true;
12222  if (NumElts == 4 && VT.is128BitVector()) {
12223    return (isMOVLMask(Mask, VT)  ||
12224            isCommutedMOVLMask(Mask, VT, true) ||
12225            isSHUFPMask(Mask, VT, Subtarget->hasFp256()) ||
12226            isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true));
12227  }
12228  return false;
12229}
12230
12231//===----------------------------------------------------------------------===//
12232//                           X86 Scheduler Hooks
12233//===----------------------------------------------------------------------===//
12234
12235/// Utility function to emit xbegin specifying the start of an RTM region.
12236static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
12237                                     const TargetInstrInfo *TII) {
12238  DebugLoc DL = MI->getDebugLoc();
12239
12240  const BasicBlock *BB = MBB->getBasicBlock();
12241  MachineFunction::iterator I = MBB;
12242  ++I;
12243
12244  // For v = xbegin(), we generate:
12245  //
12246  // thisMBB:
12247  //  xbegin sinkMBB
12248  //
12249  // mainMBB:
12250  //  eax = -1
12251  //
12252  // sinkMBB:
12253  //  v = eax
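  //
  // On a transactional abort the processor transfers control to the fallback
  // label (sinkMBB) with the abort status in EAX; on a successful start we
  // fall through to mainMBB, which materializes the XBEGIN_STARTED value (-1).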
12254
12255  MachineBasicBlock *thisMBB = MBB;
12256  MachineFunction *MF = MBB->getParent();
12257  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
12258  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
12259  MF->insert(I, mainMBB);
12260  MF->insert(I, sinkMBB);
12261
12262  // Transfer the remainder of BB and its successor edges to sinkMBB.
12263  sinkMBB->splice(sinkMBB->begin(), MBB,
12264                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
12265  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
12266
12267  // thisMBB:
12268  //  xbegin sinkMBB
12269  //  # fallthrough to mainMBB
12270  //  # abort path to sinkMBB
12271  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
12272  thisMBB->addSuccessor(mainMBB);
12273  thisMBB->addSuccessor(sinkMBB);
12274
12275  // mainMBB:
12276  //  EAX = -1
12277  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
12278  mainMBB->addSuccessor(sinkMBB);
12279
12280  // sinkMBB:
12281  // EAX is live into the sinkMBB
12282  sinkMBB->addLiveIn(X86::EAX);
12283  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
12284          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
12285    .addReg(X86::EAX);
12286
12287  MI->eraseFromParent();
12288  return sinkMBB;
12289}
12290
12291// Get CMPXCHG opcode for the specified data type.
12292static unsigned getCmpXChgOpcode(EVT VT) {
12293  switch (VT.getSimpleVT().SimpleTy) {
12294  case MVT::i8:  return X86::LCMPXCHG8;
12295  case MVT::i16: return X86::LCMPXCHG16;
12296  case MVT::i32: return X86::LCMPXCHG32;
12297  case MVT::i64: return X86::LCMPXCHG64;
12298  default:
12299    break;
12300  }
12301  llvm_unreachable("Invalid operand size!");
12302}
12303
12304// Get LOAD opcode for the specified data type.
12305static unsigned getLoadOpcode(EVT VT) {
12306  switch (VT.getSimpleVT().SimpleTy) {
12307  case MVT::i8:  return X86::MOV8rm;
12308  case MVT::i16: return X86::MOV16rm;
12309  case MVT::i32: return X86::MOV32rm;
12310  case MVT::i64: return X86::MOV64rm;
12311  default:
12312    break;
12313  }
12314  llvm_unreachable("Invalid operand size!");
12315}
12316
12317// Get the non-atomic opcode corresponding to the specified atomic instruction.
12318static unsigned getNonAtomicOpcode(unsigned Opc) {
12319  switch (Opc) {
12320  case X86::ATOMAND8:  return X86::AND8rr;
12321  case X86::ATOMAND16: return X86::AND16rr;
12322  case X86::ATOMAND32: return X86::AND32rr;
12323  case X86::ATOMAND64: return X86::AND64rr;
12324  case X86::ATOMOR8:   return X86::OR8rr;
12325  case X86::ATOMOR16:  return X86::OR16rr;
12326  case X86::ATOMOR32:  return X86::OR32rr;
12327  case X86::ATOMOR64:  return X86::OR64rr;
12328  case X86::ATOMXOR8:  return X86::XOR8rr;
12329  case X86::ATOMXOR16: return X86::XOR16rr;
12330  case X86::ATOMXOR32: return X86::XOR32rr;
12331  case X86::ATOMXOR64: return X86::XOR64rr;
12332  }
12333  llvm_unreachable("Unhandled atomic-load-op opcode!");
12334}
12335
12336// Get the non-atomic opcode corresponding to the specified atomic instruction,
12337// along with the extra opcode needed to complete the operation.
12338static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
12339                                               unsigned &ExtraOpc) {
12340  switch (Opc) {
12341  case X86::ATOMNAND8:  ExtraOpc = X86::NOT8r;   return X86::AND8rr;
12342  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r;  return X86::AND16rr;
12343  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r;  return X86::AND32rr;
12344  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r;  return X86::AND64rr;
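  // Note: there is no 8-bit CMOV, so the i8 min/max cases return the 32-bit
  // CMOV opcode; EmitAtomicLoadArith promotes the i8 operands to i32 first.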
12345  case X86::ATOMMAX8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVL32rr;
12346  case X86::ATOMMAX16:  ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
12347  case X86::ATOMMAX32:  ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
12348  case X86::ATOMMAX64:  ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
12349  case X86::ATOMMIN8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVG32rr;
12350  case X86::ATOMMIN16:  ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
12351  case X86::ATOMMIN32:  ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
12352  case X86::ATOMMIN64:  ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
12353  case X86::ATOMUMAX8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVB32rr;
12354  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
12355  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
12356  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
12357  case X86::ATOMUMIN8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVA32rr;
12358  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
12359  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
12360  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
12361  }
12362  llvm_unreachable("Unhandled atomic-load-op opcode!");
12363}
12364
12365// Get the non-atomic opcode corresponding to the specified atomic instruction
12366// for a 64-bit data type on a 32-bit target.
12367static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
12368  switch (Opc) {
12369  case X86::ATOMAND6432:  HiOpc = X86::AND32rr; return X86::AND32rr;
12370  case X86::ATOMOR6432:   HiOpc = X86::OR32rr;  return X86::OR32rr;
12371  case X86::ATOMXOR6432:  HiOpc = X86::XOR32rr; return X86::XOR32rr;
12372  case X86::ATOMADD6432:  HiOpc = X86::ADC32rr; return X86::ADD32rr;
12373  case X86::ATOMSUB6432:  HiOpc = X86::SBB32rr; return X86::SUB32rr;
12374  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
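  // For the 64-bit min/max operations the values returned below are SETcc
  // opcodes rather than arithmetic opcodes; EmitAtomicLoadArith6432 uses them
  // to materialize the low-half and high-half comparison results.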
12375  case X86::ATOMMAX6432:  HiOpc = X86::SETLr;   return X86::SETLr;
12376  case X86::ATOMMIN6432:  HiOpc = X86::SETGr;   return X86::SETGr;
12377  case X86::ATOMUMAX6432: HiOpc = X86::SETBr;   return X86::SETBr;
12378  case X86::ATOMUMIN6432: HiOpc = X86::SETAr;   return X86::SETAr;
12379  }
12380  llvm_unreachable("Unhandled atomic-load-op opcode!");
12381}
12382
12383// Get the non-atomic opcode corresponding to the specified atomic instruction
12384// for a 64-bit data type on a 32-bit target, along with the extra opcode.
12385static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
12386                                                   unsigned &HiOpc,
12387                                                   unsigned &ExtraOpc) {
12388  switch (Opc) {
12389  case X86::ATOMNAND6432:
12390    ExtraOpc = X86::NOT32r;
12391    HiOpc = X86::AND32rr;
12392    return X86::AND32rr;
12393  }
12394  llvm_unreachable("Unhandled atomic-load-op opcode!");
12395}
12396
12397// Get the pseudo CMOV opcode for the specified data type.
12398static unsigned getPseudoCMOVOpc(EVT VT) {
12399  switch (VT.getSimpleVT().SimpleTy) {
12400  case MVT::i8:  return X86::CMOV_GR8;
12401  case MVT::i16: return X86::CMOV_GR16;
12402  case MVT::i32: return X86::CMOV_GR32;
12403  default:
12404    break;
12405  }
12406  llvm_unreachable("Unknown CMOV opcode!");
12407}
12408
12409// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
12410// They will be translated into a compare-exchange loop, from
12411//
12412//    ...
12413//    dst = atomic-fetch-op MI.addr, MI.val
12414//    ...
12415//
12416// to
12417//
12418//    ...
12419//    EAX = LOAD MI.addr
12420// loop:
12421//    t1 = OP MI.val, EAX
12422//    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
12423//    JNE loop
12424// sink:
12425//    dst = EAX
12426//    ...
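//
// For illustration only (not the exact emitted MachineInstrs), an i32 atomic
// fetch-add roughly becomes:
//
//         movl  (addr), %eax
//   loop: movl  val, %t1
//         addl  %eax, %t1
//         lock cmpxchgl %t1, (addr)
//         jne   loop
//
// with the original value of the memory location left in EAX.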
12427MachineBasicBlock *
12428X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
12429                                       MachineBasicBlock *MBB) const {
12430  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12431  DebugLoc DL = MI->getDebugLoc();
12432
12433  MachineFunction *MF = MBB->getParent();
12434  MachineRegisterInfo &MRI = MF->getRegInfo();
12435
12436  const BasicBlock *BB = MBB->getBasicBlock();
12437  MachineFunction::iterator I = MBB;
12438  ++I;
12439
12440  assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 &&
12441         "Unexpected number of operands");
12442
12443  assert(MI->hasOneMemOperand() &&
12444         "Expected atomic-load-op to have one memoperand");
12445
12446  // Memory Reference
12447  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
12448  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
12449
12450  unsigned DstReg, SrcReg;
12451  unsigned MemOpndSlot;
12452
12453  unsigned CurOp = 0;
12454
12455  DstReg = MI->getOperand(CurOp++).getReg();
12456  MemOpndSlot = CurOp;
12457  CurOp += X86::AddrNumOperands;
12458  SrcReg = MI->getOperand(CurOp++).getReg();
12459
12460  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
12461  MVT::SimpleValueType VT = *RC->vt_begin();
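  // The accumulator register implicitly used by CMPXCHG: AL, AX, EAX or RAX,
  // matching the width of the operation.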
12462  unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT);
12463
12464  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
12465  unsigned LOADOpc = getLoadOpcode(VT);
12466
12467  // For the atomic load-arith operator, we generate
12468  //
12469  //  thisMBB:
12470  //    EAX = LOAD [MI.addr]
12471  //  mainMBB:
12472  //    t1 = OP MI.val, EAX
12473  //    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
12474  //    JNE mainMBB
12475  //  sinkMBB:
12476
12477  MachineBasicBlock *thisMBB = MBB;
12478  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
12479  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
12480  MF->insert(I, mainMBB);
12481  MF->insert(I, sinkMBB);
12482
12483  MachineInstrBuilder MIB;
12484
12485  // Transfer the remainder of BB and its successor edges to sinkMBB.
12486  sinkMBB->splice(sinkMBB->begin(), MBB,
12487                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
12488  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
12489
12490  // thisMBB:
12491  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg);
12492  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
12493    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
12494  MIB.setMemRefs(MMOBegin, MMOEnd);
12495
12496  thisMBB->addSuccessor(mainMBB);
12497
12498  // mainMBB:
12499  MachineBasicBlock *origMainMBB = mainMBB;
12500  mainMBB->addLiveIn(AccPhyReg);
12501
12502  // Copy AccPhyReg as it is used more than once.
12503  unsigned AccReg = MRI.createVirtualRegister(RC);
12504  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg)
12505    .addReg(AccPhyReg);
12506
12507  unsigned t1 = MRI.createVirtualRegister(RC);
12508  unsigned Opc = MI->getOpcode();
12509  switch (Opc) {
12510  default:
12511    llvm_unreachable("Unhandled atomic-load-op opcode!");
12512  case X86::ATOMAND8:
12513  case X86::ATOMAND16:
12514  case X86::ATOMAND32:
12515  case X86::ATOMAND64:
12516  case X86::ATOMOR8:
12517  case X86::ATOMOR16:
12518  case X86::ATOMOR32:
12519  case X86::ATOMOR64:
12520  case X86::ATOMXOR8:
12521  case X86::ATOMXOR16:
12522  case X86::ATOMXOR32:
12523  case X86::ATOMXOR64: {
12524    unsigned ARITHOpc = getNonAtomicOpcode(Opc);
12525    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg)
12526      .addReg(AccReg);
12527    break;
12528  }
12529  case X86::ATOMNAND8:
12530  case X86::ATOMNAND16:
12531  case X86::ATOMNAND32:
12532  case X86::ATOMNAND64: {
12533    unsigned t2 = MRI.createVirtualRegister(RC);
12534    unsigned NOTOpc;
12535    unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
12536    BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg)
12537      .addReg(AccReg);
12538    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2);
12539    break;
12540  }
12541  case X86::ATOMMAX8:
12542  case X86::ATOMMAX16:
12543  case X86::ATOMMAX32:
12544  case X86::ATOMMAX64:
12545  case X86::ATOMMIN8:
12546  case X86::ATOMMIN16:
12547  case X86::ATOMMIN32:
12548  case X86::ATOMMIN64:
12549  case X86::ATOMUMAX8:
12550  case X86::ATOMUMAX16:
12551  case X86::ATOMUMAX32:
12552  case X86::ATOMUMAX64:
12553  case X86::ATOMUMIN8:
12554  case X86::ATOMUMIN16:
12555  case X86::ATOMUMIN32:
12556  case X86::ATOMUMIN64: {
12557    unsigned CMPOpc;
12558    unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
12559
12560    BuildMI(mainMBB, DL, TII->get(CMPOpc))
12561      .addReg(SrcReg)
12562      .addReg(AccReg);
12563
12564    if (Subtarget->hasCMov()) {
12565      if (VT != MVT::i8) {
12566        // Native support
12567        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1)
12568          .addReg(SrcReg)
12569          .addReg(AccReg);
12570      } else {
12571        // Promote i8 to i32 to use CMOV32
12572        const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32);
12573        unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
12574        unsigned AccReg32 = MRI.createVirtualRegister(RC32);
12575        unsigned t2 = MRI.createVirtualRegister(RC32);
12576
12577        unsigned Undef = MRI.createVirtualRegister(RC32);
12578        BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
12579
12580        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
12581          .addReg(Undef)
12582          .addReg(SrcReg)
12583          .addImm(X86::sub_8bit);
12584        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
12585          .addReg(Undef)
12586          .addReg(AccReg)
12587          .addImm(X86::sub_8bit);
12588
12589        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
12590          .addReg(SrcReg32)
12591          .addReg(AccReg32);
12592
12593        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1)
12594          .addReg(t2, 0, X86::sub_8bit);
12595      }
12596    } else {
12597      // Use pseudo select and lower them.
12598      assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
12599             "Invalid atomic-load-op transformation!");
12600      unsigned SelOpc = getPseudoCMOVOpc(VT);
12601      X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
12602      assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
12603      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1)
12604              .addReg(SrcReg).addReg(AccReg)
12605              .addImm(CC);
12606      mainMBB = EmitLoweredSelect(MIB, mainMBB);
12607    }
12608    break;
12609  }
12610  }
12611
12612  // Copy AccPhyReg back from virtual register.
12613  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg)
12614    .addReg(AccReg);
12615
12616  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
12617  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
12618    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
12619  MIB.addReg(t1);
12620  MIB.setMemRefs(MMOBegin, MMOEnd);
12621
12622  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
12623
12624  mainMBB->addSuccessor(origMainMBB);
12625  mainMBB->addSuccessor(sinkMBB);
12626
12627  // sinkMBB:
12628  sinkMBB->addLiveIn(AccPhyReg);
12629
12630  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
12631          TII->get(TargetOpcode::COPY), DstReg)
12632    .addReg(AccPhyReg);
12633
12634  MI->eraseFromParent();
12635  return sinkMBB;
12636}
12637
12638// EmitAtomicLoadArith6432 - emit the code sequence for 64-bit pseudo atomic
12639// instructions on a 32-bit target. They will be translated into a
12640// compare-exchange loop, from
12641//
12642//    ...
12643//    dst = atomic-fetch-op MI.addr, MI.val
12644//    ...
12645//
12646// to
12647//
12648//    ...
12649//    EAX = LOAD [MI.addr + 0]
12650//    EDX = LOAD [MI.addr + 4]
12651// loop:
12652//    EBX = OP MI.val.lo, EAX
12653//    ECX = OP MI.val.hi, EDX
12654//    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
12655//    JNE loop
12656// sink:
12657//    dst = EDX:EAX
12658//    ...
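//
// CMPXCHG8B compares EDX:EAX with the 64-bit memory operand; if they are
// equal it stores ECX:EBX to memory, otherwise it loads the memory value
// into EDX:EAX. Either way ZF reflects the outcome, which is why the loop
// re-executes on JNE with the freshly loaded value.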
12659MachineBasicBlock *
12660X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
12661                                           MachineBasicBlock *MBB) const {
12662  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12663  DebugLoc DL = MI->getDebugLoc();
12664
12665  MachineFunction *MF = MBB->getParent();
12666  MachineRegisterInfo &MRI = MF->getRegInfo();
12667
12668  const BasicBlock *BB = MBB->getBasicBlock();
12669  MachineFunction::iterator I = MBB;
12670  ++I;
12671
12672  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
12673         "Unexpected number of operands");
12674
12675  assert(MI->hasOneMemOperand() &&
12676         "Expected atomic-load-op32 to have one memoperand");
12677
12678  // Memory Reference
12679  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
12680  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
12681
12682  unsigned DstLoReg, DstHiReg;
12683  unsigned SrcLoReg, SrcHiReg;
12684  unsigned MemOpndSlot;
12685
12686  unsigned CurOp = 0;
12687
12688  DstLoReg = MI->getOperand(CurOp++).getReg();
12689  DstHiReg = MI->getOperand(CurOp++).getReg();
12690  MemOpndSlot = CurOp;
12691  CurOp += X86::AddrNumOperands;
12692  SrcLoReg = MI->getOperand(CurOp++).getReg();
12693  SrcHiReg = MI->getOperand(CurOp++).getReg();
12694
12695  const TargetRegisterClass *RC = &X86::GR32RegClass;
12696  const TargetRegisterClass *RC8 = &X86::GR8RegClass;
12697
12698  unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
12699  unsigned LOADOpc = X86::MOV32rm;
12700
12701  // For the atomic load-arith operator, we generate
12702  //
12703  //  thisMBB:
12704  //    EAX = LOAD [MI.addr + 0]
12705  //    EDX = LOAD [MI.addr + 4]
12706  //  mainMBB:
12707  //    EBX = OP MI.val.lo, EAX
12708  //    ECX = OP MI.val.hi, EDX
12709  //    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
12710  //    JNE mainMBB
12711  //  sinkMBB:
12712
12713  MachineBasicBlock *thisMBB = MBB;
12714  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
12715  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
12716  MF->insert(I, mainMBB);
12717  MF->insert(I, sinkMBB);
12718
12719  MachineInstrBuilder MIB;
12720
12721  // Transfer the remainder of BB and its successor edges to sinkMBB.
12722  sinkMBB->splice(sinkMBB->begin(), MBB,
12723                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
12724  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
12725
12726  // thisMBB:
12727  // Lo
12728  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX);
12729  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
12730    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
12731  MIB.setMemRefs(MMOBegin, MMOEnd);
12732  // Hi
12733  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX);
12734  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
12735    if (i == X86::AddrDisp)
12736      MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
12737    else
12738      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
12739  }
12740  MIB.setMemRefs(MMOBegin, MMOEnd);
12741
12742  thisMBB->addSuccessor(mainMBB);
12743
12744  // mainMBB:
12745  MachineBasicBlock *origMainMBB = mainMBB;
12746  mainMBB->addLiveIn(X86::EAX);
12747  mainMBB->addLiveIn(X86::EDX);
12748
12749  // Copy EDX:EAX as they are used more than once.
12750  unsigned LoReg = MRI.createVirtualRegister(RC);
12751  unsigned HiReg = MRI.createVirtualRegister(RC);
12752  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX);
12753  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX);
12754
12755  unsigned t1L = MRI.createVirtualRegister(RC);
12756  unsigned t1H = MRI.createVirtualRegister(RC);
12757
12758  unsigned Opc = MI->getOpcode();
12759  switch (Opc) {
12760  default:
12761    llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
12762  case X86::ATOMAND6432:
12763  case X86::ATOMOR6432:
12764  case X86::ATOMXOR6432:
12765  case X86::ATOMADD6432:
12766  case X86::ATOMSUB6432: {
12767    unsigned HiOpc;
12768    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
12769    BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg);
12770    BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg);
12771    break;
12772  }
12773  case X86::ATOMNAND6432: {
12774    unsigned HiOpc, NOTOpc;
12775    unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
12776    unsigned t2L = MRI.createVirtualRegister(RC);
12777    unsigned t2H = MRI.createVirtualRegister(RC);
12778    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg);
12779    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg);
12780    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L);
12781    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H);
12782    break;
12783  }
12784  case X86::ATOMMAX6432:
12785  case X86::ATOMMIN6432:
12786  case X86::ATOMUMAX6432:
12787  case X86::ATOMUMIN6432: {
12788    unsigned HiOpc;
12789    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
12790    unsigned cL = MRI.createVirtualRegister(RC8);
12791    unsigned cH = MRI.createVirtualRegister(RC8);
12792    unsigned cL32 = MRI.createVirtualRegister(RC);
12793    unsigned cH32 = MRI.createVirtualRegister(RC);
12794    unsigned cc = MRI.createVirtualRegister(RC);
12795    // cl := cmp src_lo, lo
12796    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
12797      .addReg(SrcLoReg).addReg(LoReg);
12798    BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
12799    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
12800    // ch := cmp src_hi, hi
12801    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
12802      .addReg(SrcHiReg).addReg(HiReg);
12803    BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
12804    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
12805    // cc := (src_hi == hi) ? cl : ch;
12806    if (Subtarget->hasCMov()) {
12807      BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
12808        .addReg(cH32).addReg(cL32);
12809    } else {
12810      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
12811              .addReg(cH32).addReg(cL32)
12812              .addImm(X86::COND_E);
12813      mainMBB = EmitLoweredSelect(MIB, mainMBB);
12814    }
12815    BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
12816    if (Subtarget->hasCMov()) {
12817      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L)
12818        .addReg(SrcLoReg).addReg(LoReg);
12819      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H)
12820        .addReg(SrcHiReg).addReg(HiReg);
12821    } else {
12822      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L)
12823              .addReg(SrcLoReg).addReg(LoReg)
12824              .addImm(X86::COND_NE);
12825      mainMBB = EmitLoweredSelect(MIB, mainMBB);
12826      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H)
12827              .addReg(SrcHiReg).addReg(HiReg)
12828              .addImm(X86::COND_NE);
12829      mainMBB = EmitLoweredSelect(MIB, mainMBB);
12830    }
12831    break;
12832  }
12833  case X86::ATOMSWAP6432: {
12834    unsigned HiOpc;
12835    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
12836    BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg);
12837    BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg);
12838    break;
12839  }
12840  }
12841
12842  // Copy EDX:EAX back from HiReg:LoReg
12843  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg);
12844  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg);
12845  // Copy ECX:EBX from t1H:t1L
12846  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L);
12847  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H);
12848
12849  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
12850  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
12851    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
12852  MIB.setMemRefs(MMOBegin, MMOEnd);
12853
12854  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
12855
12856  mainMBB->addSuccessor(origMainMBB);
12857  mainMBB->addSuccessor(sinkMBB);
12858
12859  // sinkMBB:
12860  sinkMBB->addLiveIn(X86::EAX);
12861  sinkMBB->addLiveIn(X86::EDX);
12862
12863  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
12864          TII->get(TargetOpcode::COPY), DstLoReg)
12865    .addReg(X86::EAX);
12866  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
12867          TII->get(TargetOpcode::COPY), DstHiReg)
12868    .addReg(X86::EDX);
12869
12870  MI->eraseFromParent();
12871  return sinkMBB;
12872}
12873
12874// FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8 or
12875// XMM0_V32I8 in AVX, all of this code can be replaced with patterns in
12876// the .td file.
12877static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
12878                                       const TargetInstrInfo *TII) {
12879  unsigned Opc;
12880  switch (MI->getOpcode()) {
12881  default: llvm_unreachable("illegal opcode!");
12882  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
12883  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
12884  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
12885  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
12886  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
12887  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
12888  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
12889  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
12890  }
12891
12892  DebugLoc dl = MI->getDebugLoc();
12893  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
12894
12895  unsigned NumArgs = MI->getNumOperands();
12896  for (unsigned i = 1; i < NumArgs; ++i) {
12897    MachineOperand &Op = MI->getOperand(i);
12898    if (!(Op.isReg() && Op.isImplicit()))
12899      MIB.addOperand(Op);
12900  }
12901  if (MI->hasOneMemOperand())
12902    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
12903
12904  BuildMI(*BB, MI, dl,
12905    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
12906    .addReg(X86::XMM0);
12907
12908  MI->eraseFromParent();
12909  return BB;
12910}
12911
12912// FIXME: Custom handling because TableGen doesn't support multiple implicit
12913// defs in an instruction pattern
12914static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
12915                                       const TargetInstrInfo *TII) {
12916  unsigned Opc;
12917  switch (MI->getOpcode()) {
12918  default: llvm_unreachable("illegal opcode!");
12919  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
12920  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
12921  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
12922  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
12923  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
12924  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
12925  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
12926  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
12927  }
12928
12929  DebugLoc dl = MI->getDebugLoc();
12930  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
12931
12932  unsigned NumArgs = MI->getNumOperands(); // operand 0 (the result) is skipped below
12933  for (unsigned i = 1; i < NumArgs; ++i) {
12934    MachineOperand &Op = MI->getOperand(i);
12935    if (!(Op.isReg() && Op.isImplicit()))
12936      MIB.addOperand(Op);
12937  }
12938  if (MI->hasOneMemOperand())
12939    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
12940
12941  BuildMI(*BB, MI, dl,
12942    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
12943    .addReg(X86::ECX);
12944
12945  MI->eraseFromParent();
12946  return BB;
12947}
12948
12949static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
12950                                       const TargetInstrInfo *TII,
12951                                       const X86Subtarget* Subtarget) {
12952  DebugLoc dl = MI->getDebugLoc();
12953
12954  // Address into RAX/EAX, other two args into ECX, EDX.
12955  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
12956  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
12957  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
12958  for (int i = 0; i < X86::AddrNumOperands; ++i)
12959    MIB.addOperand(MI->getOperand(i));
12960
12961  unsigned ValOps = X86::AddrNumOperands;
12962  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
12963    .addReg(MI->getOperand(ValOps).getReg());
12964  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
12965    .addReg(MI->getOperand(ValOps+1).getReg());
12966
12967  // MONITOR itself takes no explicit operands; it uses EAX/RAX, ECX and EDX.
12968  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
12969
12970  MI->eraseFromParent(); // The pseudo is gone now.
12971  return BB;
12972}
12973
12974MachineBasicBlock *
12975X86TargetLowering::EmitVAARG64WithCustomInserter(
12976                   MachineInstr *MI,
12977                   MachineBasicBlock *MBB) const {
12978  // Emit va_arg instruction on X86-64.
12979
12980  // Operands to this pseudo-instruction:
12981  // 0  ) Output        : destination address (reg)
12982  // 1-5) Input         : va_list address (addr, i64mem)
12983  // 6  ) ArgSize       : Size (in bytes) of vararg type
12984  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
12985  // 8  ) Align         : Alignment of type
12986  // 9  ) EFLAGS (implicit-def)
12987
12988  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
12989  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
12990
12991  unsigned DestReg = MI->getOperand(0).getReg();
12992  MachineOperand &Base = MI->getOperand(1);
12993  MachineOperand &Scale = MI->getOperand(2);
12994  MachineOperand &Index = MI->getOperand(3);
12995  MachineOperand &Disp = MI->getOperand(4);
12996  MachineOperand &Segment = MI->getOperand(5);
12997  unsigned ArgSize = MI->getOperand(6).getImm();
12998  unsigned ArgMode = MI->getOperand(7).getImm();
12999  unsigned Align = MI->getOperand(8).getImm();
13000
13001  // Memory Reference
13002  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
13003  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
13004  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
13005
13006  // Machine Information
13007  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13008  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
13009  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
13010  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
13011  DebugLoc DL = MI->getDebugLoc();
13012
13013  // struct va_list {
13014  //   i32   gp_offset
13015  //   i32   fp_offset
13016  //   i64   overflow_area (address)
13017  //   i64   reg_save_area (address)
13018  // }
13019  // sizeof(va_list) = 24
13020  // alignment(va_list) = 8
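  //
  // gp_offset/fp_offset are byte offsets into reg_save_area: the general-
  // purpose registers occupy the first 6*8 bytes and the XMM registers the
  // following 8*16 bytes (x86-64 SysV ABI).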
13021
13022  unsigned TotalNumIntRegs = 6;
13023  unsigned TotalNumXMMRegs = 8;
13024  bool UseGPOffset = (ArgMode == 1);
13025  bool UseFPOffset = (ArgMode == 2);
13026  unsigned MaxOffset = TotalNumIntRegs * 8 +
13027                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
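  // MaxOffset is therefore 48 when pulling from the GP registers and
  // 48 + 128 = 176 when pulling from the XMM registers.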
13028
13029  // Align ArgSize to a multiple of 8.
13030  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
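  // e.g. ArgSize = 12 rounds up to ArgSizeA8 = 16.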
13031  bool NeedsAlign = (Align > 8);
13032
13033  MachineBasicBlock *thisMBB = MBB;
13034  MachineBasicBlock *overflowMBB;
13035  MachineBasicBlock *offsetMBB;
13036  MachineBasicBlock *endMBB;
13037
13038  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
13039  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
13040  unsigned OffsetReg = 0;
13041
13042  if (!UseGPOffset && !UseFPOffset) {
13043    // If we only pull from the overflow region, we don't create a branch.
13044    // We don't need to alter control flow.
13045    OffsetDestReg = 0; // unused
13046    OverflowDestReg = DestReg;
13047
13048    offsetMBB = NULL;
13049    overflowMBB = thisMBB;
13050    endMBB = thisMBB;
13051  } else {
13052    // First emit code to check if gp_offset (or fp_offset) is below the bound.
13053    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
13054    // If not, pull from overflow_area. (branch to overflowMBB)
13055    //
13056    //       thisMBB
13057    //         |     .
13058    //         |        .
13059    //     offsetMBB   overflowMBB
13060    //         |        .
13061    //         |     .
13062    //        endMBB
13063
13064    // Registers for the PHI in endMBB
13065    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
13066    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
13067
13068    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
13069    MachineFunction *MF = MBB->getParent();
13070    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13071    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13072    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13073
13074    MachineFunction::iterator MBBIter = MBB;
13075    ++MBBIter;
13076
13077    // Insert the new basic blocks
13078    MF->insert(MBBIter, offsetMBB);
13079    MF->insert(MBBIter, overflowMBB);
13080    MF->insert(MBBIter, endMBB);
13081
13082    // Transfer the remainder of MBB and its successor edges to endMBB.
13083    endMBB->splice(endMBB->begin(), thisMBB,
13084                    llvm::next(MachineBasicBlock::iterator(MI)),
13085                    thisMBB->end());
13086    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
13087
13088    // Make offsetMBB and overflowMBB successors of thisMBB
13089    thisMBB->addSuccessor(offsetMBB);
13090    thisMBB->addSuccessor(overflowMBB);
13091
13092    // endMBB is a successor of both offsetMBB and overflowMBB
13093    offsetMBB->addSuccessor(endMBB);
13094    overflowMBB->addSuccessor(endMBB);
13095
13096    // Load the offset value into a register
13097    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
13098    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
13099      .addOperand(Base)
13100      .addOperand(Scale)
13101      .addOperand(Index)
13102      .addDisp(Disp, UseFPOffset ? 4 : 0)
13103      .addOperand(Segment)
13104      .setMemRefs(MMOBegin, MMOEnd);
13105
13106    // Check if there is enough room left to pull this argument.
13107    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
13108      .addReg(OffsetReg)
13109      .addImm(MaxOffset + 8 - ArgSizeA8);
13110
13111    // Branch to "overflowMBB" if offset >= max
13112    // Fall through to "offsetMBB" otherwise
13113    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
13114      .addMBB(overflowMBB);
13115  }
13116
13117  // In offsetMBB, emit code to use the reg_save_area.
13118  if (offsetMBB) {
13119    assert(OffsetReg != 0);
13120
13121    // Read the reg_save_area address.
13122    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
13123    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
13124      .addOperand(Base)
13125      .addOperand(Scale)
13126      .addOperand(Index)
13127      .addDisp(Disp, 16)
13128      .addOperand(Segment)
13129      .setMemRefs(MMOBegin, MMOEnd);
13130
13131    // Zero-extend the offset
13132    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
13133      BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
13134        .addImm(0)
13135        .addReg(OffsetReg)
13136        .addImm(X86::sub_32bit);
13137
13138    // Add the offset to the reg_save_area to get the final address.
13139    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
13140      .addReg(OffsetReg64)
13141      .addReg(RegSaveReg);
13142
13143    // Compute the offset for the next argument
13144    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
13145    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
13146      .addReg(OffsetReg)
13147      .addImm(UseFPOffset ? 16 : 8);
13148
13149    // Store it back into the va_list.
13150    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
13151      .addOperand(Base)
13152      .addOperand(Scale)
13153      .addOperand(Index)
13154      .addDisp(Disp, UseFPOffset ? 4 : 0)
13155      .addOperand(Segment)
13156      .addReg(NextOffsetReg)
13157      .setMemRefs(MMOBegin, MMOEnd);
13158
13159    // Jump to endMBB
13160    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
13161      .addMBB(endMBB);
13162  }
13163
13164  //
13165  // Emit code to use overflow area
13166  //
13167
13168  // Load the overflow_area address into a register.
13169  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
13170  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
13171    .addOperand(Base)
13172    .addOperand(Scale)
13173    .addOperand(Index)
13174    .addDisp(Disp, 8)
13175    .addOperand(Segment)
13176    .setMemRefs(MMOBegin, MMOEnd);
13177
13178  // If we need to align it, do so. Otherwise, just copy the address
13179  // to OverflowDestReg.
13180  if (NeedsAlign) {
13181    // Align the overflow address
13182    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
13183    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
13184
13185    // aligned_addr = (addr + (align-1)) & ~(align-1)
13186    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
13187      .addReg(OverflowAddrReg)
13188      .addImm(Align-1);
13189
13190    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
13191      .addReg(TmpReg)
13192      .addImm(~(uint64_t)(Align-1));
13193  } else {
13194    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
13195      .addReg(OverflowAddrReg);
13196  }
13197
13198  // Compute the next overflow address after this argument.
13199  // (the overflow address should be kept 8-byte aligned)
13200  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
13201  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
13202    .addReg(OverflowDestReg)
13203    .addImm(ArgSizeA8);
13204
13205  // Store the new overflow address.
13206  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
13207    .addOperand(Base)
13208    .addOperand(Scale)
13209    .addOperand(Index)
13210    .addDisp(Disp, 8)
13211    .addOperand(Segment)
13212    .addReg(NextAddrReg)
13213    .setMemRefs(MMOBegin, MMOEnd);
13214
13215  // If we branched, emit the PHI to the front of endMBB.
13216  if (offsetMBB) {
13217    BuildMI(*endMBB, endMBB->begin(), DL,
13218            TII->get(X86::PHI), DestReg)
13219      .addReg(OffsetDestReg).addMBB(offsetMBB)
13220      .addReg(OverflowDestReg).addMBB(overflowMBB);
13221  }
13222
13223  // Erase the pseudo instruction
13224  MI->eraseFromParent();
13225
13226  return endMBB;
13227}
13228
13229MachineBasicBlock *
13230X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
13231                                                 MachineInstr *MI,
13232                                                 MachineBasicBlock *MBB) const {
13233  // Emit code to save XMM registers to the stack. The ABI says that the
13234  // number of registers to save is given in %al, so it's theoretically
13235  // possible to do an indirect jump trick to avoid saving all of them;
13236  // however, this code takes a simpler approach and just executes all
13237  // of the stores if %al is non-zero. It's less code, and it's probably
13238  // easier on the hardware branch predictor, and stores aren't all that
13239  // expensive anyway.
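  //
  // Roughly, the code emitted below amounts to the following sketch (not the
  // exact MI sequence), where <rsfi> is RegSaveFrameIndex and <fpoff> is
  // VarArgsFPOffset:
  //
  //     testb %al, %al              # omitted on Win64
  //     je    .LendMBB
  //   .LxmmSaveMBB:
  //     movaps %xmm0, <fpoff>+0(<rsfi>)
  //     movaps %xmm1, <fpoff>+16(<rsfi>)
  //     ...                         # one 16-byte store per XMM argument reg
  //   .LendMBB: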
13240
13241  // Create the new basic blocks. One block contains all the XMM stores,
13242  // and one block is the final destination regardless of whether any
13243  // stores were performed.
13244  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
13245  MachineFunction *F = MBB->getParent();
13246  MachineFunction::iterator MBBIter = MBB;
13247  ++MBBIter;
13248  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
13249  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
13250  F->insert(MBBIter, XMMSaveMBB);
13251  F->insert(MBBIter, EndMBB);
13252
13253  // Transfer the remainder of MBB and its successor edges to EndMBB.
13254  EndMBB->splice(EndMBB->begin(), MBB,
13255                 llvm::next(MachineBasicBlock::iterator(MI)),
13256                 MBB->end());
13257  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
13258
13259  // The original block will now fall through to the XMM save block.
13260  MBB->addSuccessor(XMMSaveMBB);
13261  // The XMMSaveMBB will fall through to the end block.
13262  XMMSaveMBB->addSuccessor(EndMBB);
13263
13264  // Now add the instructions.
13265  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13266  DebugLoc DL = MI->getDebugLoc();
13267
13268  unsigned CountReg = MI->getOperand(0).getReg();
13269  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
13270  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
13271
13272  if (!Subtarget->isTargetWin64()) {
13273    // If %al is 0, branch around the XMM save block.
13274    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
13275    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
13276    MBB->addSuccessor(EndMBB);
13277  }
13278
13279  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
13280  // In the XMM save block, save all the XMM argument registers.
13281  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
13282    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
13283    MachineMemOperand *MMO =
13284      F->getMachineMemOperand(
13285          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
13286        MachineMemOperand::MOStore,
13287        /*Size=*/16, /*Align=*/16);
13288    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
13289      .addFrameIndex(RegSaveFrameIndex)
13290      .addImm(/*Scale=*/1)
13291      .addReg(/*IndexReg=*/0)
13292      .addImm(/*Disp=*/Offset)
13293      .addReg(/*Segment=*/0)
13294      .addReg(MI->getOperand(i).getReg())
13295      .addMemOperand(MMO);
13296  }
13297
13298  MI->eraseFromParent();   // The pseudo instruction is gone now.
13299
13300  return EndMBB;
13301}
13302
13303// The EFLAGS operand of SelectItr might be missing a kill marker
13304// because there were multiple uses of EFLAGS, and ISel didn't know
13305// which to mark. Figure out whether SelectItr should have had a
13306// kill marker, and set it if it should. Returns the correct kill
13307// marker value.
13308static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
13309                                     MachineBasicBlock* BB,
13310                                     const TargetRegisterInfo* TRI) {
13311  // Scan forward through BB for a use/def of EFLAGS.
13312  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
13313  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
13314    const MachineInstr& mi = *miI;
13315    if (mi.readsRegister(X86::EFLAGS))
13316      return false;
13317    if (mi.definesRegister(X86::EFLAGS))
13318      break; // Should have kill-flag - update below.
13319  }
13320
13321  // If we hit the end of the block, check whether EFLAGS is live into a
13322  // successor.
13323  if (miI == BB->end()) {
13324    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
13325                                          sEnd = BB->succ_end();
13326         sItr != sEnd; ++sItr) {
13327      MachineBasicBlock* succ = *sItr;
13328      if (succ->isLiveIn(X86::EFLAGS))
13329        return false;
13330    }
13331  }
13332
13333  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
13334  // out. SelectMI should have a kill flag on EFLAGS.
13335  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
13336  return true;
13337}
13338
13339MachineBasicBlock *
13340X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
13341                                     MachineBasicBlock *BB) const {
13342  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13343  DebugLoc DL = MI->getDebugLoc();
13344
13345  // To "insert" a SELECT_CC instruction, we actually have to insert the
13346  // diamond control-flow pattern.  The incoming instruction knows the
13347  // destination vreg to set, the condition code register to branch on, the
13348  // true/false values to select between, and a branch opcode to use.
13349  const BasicBlock *LLVM_BB = BB->getBasicBlock();
13350  MachineFunction::iterator It = BB;
13351  ++It;
13352
13353  //  thisMBB:
13354  //  ...
13355  //   TrueVal = ...
13356  //   cmpTY ccX, r1, r2
13357  //   bCC copy1MBB
13358  //   fallthrough --> copy0MBB
13359  MachineBasicBlock *thisMBB = BB;
13360  MachineFunction *F = BB->getParent();
13361  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
13362  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
13363  F->insert(It, copy0MBB);
13364  F->insert(It, sinkMBB);
13365
13366  // If the EFLAGS register isn't dead in the terminator, then claim that it's
13367  // live into the sink and copy blocks.
13368  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
13369  if (!MI->killsRegister(X86::EFLAGS) &&
13370      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
13371    copy0MBB->addLiveIn(X86::EFLAGS);
13372    sinkMBB->addLiveIn(X86::EFLAGS);
13373  }
13374
13375  // Transfer the remainder of BB and its successor edges to sinkMBB.
13376  sinkMBB->splice(sinkMBB->begin(), BB,
13377                  llvm::next(MachineBasicBlock::iterator(MI)),
13378                  BB->end());
13379  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
13380
13381  // Add the true and fallthrough blocks as its successors.
13382  BB->addSuccessor(copy0MBB);
13383  BB->addSuccessor(sinkMBB);
13384
13385  // Create the conditional branch instruction.
13386  unsigned Opc =
13387    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
13388  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
13389
13390  //  copy0MBB:
13391  //   %FalseValue = ...
13392  //   # fallthrough to sinkMBB
13393  copy0MBB->addSuccessor(sinkMBB);
13394
13395  //  sinkMBB:
13396  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
13397  //  ...
13398  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
13399          TII->get(X86::PHI), MI->getOperand(0).getReg())
13400    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
13401    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
13402
13403  MI->eraseFromParent();   // The pseudo instruction is gone now.
13404  return sinkMBB;
13405}
13406
13407MachineBasicBlock *
13408X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
13409                                        bool Is64Bit) const {
13410  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13411  DebugLoc DL = MI->getDebugLoc();
13412  MachineFunction *MF = BB->getParent();
13413  const BasicBlock *LLVM_BB = BB->getBasicBlock();
13414
13415  assert(getTargetMachine().Options.EnableSegmentedStacks);
13416
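  // The current stacklet's limit lives in thread-local storage; the offsets
  // below (%fs:0x70 on x86-64, %gs:0x30 on x86-32) are the slots assumed by
  // the segmented-stack runtime (libgcc's __morestack support).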
13417  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
13418  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
13419
13420  // BB:
13421  //  ... [Till the alloca]
13422  // If stacklet is not large enough, jump to mallocMBB
13423  //
13424  // bumpMBB:
13425  //  Allocate by subtracting from RSP
13426  //  Jump to continueMBB
13427  //
13428  // mallocMBB:
13429  //  Allocate by call to runtime
13430  //
13431  // continueMBB:
13432  //  ...
13433  //  [rest of original BB]
13434  //
13435
13436  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13437  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13438  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13439
13440  MachineRegisterInfo &MRI = MF->getRegInfo();
13441  const TargetRegisterClass *AddrRegClass =
13442    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
13443
13444  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
13445    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
13446    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
13447    SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
13448    sizeVReg = MI->getOperand(1).getReg(),
13449    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
13450
13451  MachineFunction::iterator MBBIter = BB;
13452  ++MBBIter;
13453
13454  MF->insert(MBBIter, bumpMBB);
13455  MF->insert(MBBIter, mallocMBB);
13456  MF->insert(MBBIter, continueMBB);
13457
13458  continueMBB->splice(continueMBB->begin(), BB, llvm::next
13459                      (MachineBasicBlock::iterator(MI)), BB->end());
13460  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
13461
13462  // Add code to the main basic block to check if the stack limit has been hit,
13463  // and if so, jump to mallocMBB otherwise to bumpMBB.
13464  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
13465  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
13466    .addReg(tmpSPVReg).addReg(sizeVReg);
13467  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
13468    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
13469    .addReg(SPLimitVReg);
13470  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
13471
13472  // bumpMBB simply decreases the stack pointer, since we know the current
13473  // stacklet has enough space.
13474  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
13475    .addReg(SPLimitVReg);
13476  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
13477    .addReg(SPLimitVReg);
13478  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
13479
13480  // mallocMBB calls a routine in libgcc to allocate more space from the heap.
13481  const uint32_t *RegMask =
13482    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
13483  if (Is64Bit) {
13484    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
13485      .addReg(sizeVReg);
13486    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
13487      .addExternalSymbol("__morestack_allocate_stack_space")
13488      .addRegMask(RegMask)
13489      .addReg(X86::RDI, RegState::Implicit)
13490      .addReg(X86::RAX, RegState::ImplicitDefine);
13491  } else {
13492    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
13493      .addImm(12);
13494    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
13495    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
13496      .addExternalSymbol("__morestack_allocate_stack_space")
13497      .addRegMask(RegMask)
13498      .addReg(X86::EAX, RegState::ImplicitDefine);
13499  }
13500
13501  if (!Is64Bit)
13502    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
13503      .addImm(16);
13504
13505  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
13506    .addReg(Is64Bit ? X86::RAX : X86::EAX);
13507  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
13508
13509  // Set up the CFG correctly.
13510  BB->addSuccessor(bumpMBB);
13511  BB->addSuccessor(mallocMBB);
13512  mallocMBB->addSuccessor(continueMBB);
13513  bumpMBB->addSuccessor(continueMBB);
13514
13515  // Take care of the PHI nodes.
13516  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
13517          MI->getOperand(0).getReg())
13518    .addReg(mallocPtrVReg).addMBB(mallocMBB)
13519    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
13520
13521  // Delete the original pseudo instruction.
13522  MI->eraseFromParent();
13523
13524  // And we're done.
13525  return continueMBB;
13526}
13527
13528MachineBasicBlock *
13529X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
13530                                          MachineBasicBlock *BB) const {
13531  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13532  DebugLoc DL = MI->getDebugLoc();
13533
13534  assert(!Subtarget->isTargetEnvMacho());
13535
13536  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
13537  // non-trivial part is the implicit def of ESP.
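  //
  // For example, on Win64 with the MSVCRT __chkstk, the code below amounts to
  // roughly (the allocation size is expected in RAX, see the implicit use):
  //
  //     callq __chkstk          # probes the stack, does not adjust RSP
  //     subq  %rax, %rsp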
13538
13539  if (Subtarget->isTargetWin64()) {
13540    if (Subtarget->isTargetCygMing()) {
13541      // ___chkstk (MinGW-w64):
13542      // Clobbers R10, R11, RAX and EFLAGS.
13543      // Updates RSP.
13544      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
13545        .addExternalSymbol("___chkstk")
13546        .addReg(X86::RAX, RegState::Implicit)
13547        .addReg(X86::RSP, RegState::Implicit)
13548        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
13549        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
13550        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
13551    } else {
13552      // __chkstk(MSVCRT): does not update stack pointer.
13553      // Clobbers R10, R11 and EFLAGS.
13554      // FIXME: RAX(allocated size) might be reused and not killed.
13555      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
13556        .addExternalSymbol("__chkstk")
13557        .addReg(X86::RAX, RegState::Implicit)
13558        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
13559      // RAX has the offset to be subtracted from RSP.
13560      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
13561        .addReg(X86::RSP)
13562        .addReg(X86::RAX);
13563    }
13564  } else {
13565    const char *StackProbeSymbol =
13566      Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
13567
13568    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
13569      .addExternalSymbol(StackProbeSymbol)
13570      .addReg(X86::EAX, RegState::Implicit)
13571      .addReg(X86::ESP, RegState::Implicit)
13572      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
13573      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
13574      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
13575  }
13576
13577  MI->eraseFromParent();   // The pseudo instruction is gone now.
13578  return BB;
13579}
13580
13581MachineBasicBlock *
13582X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
13583                                      MachineBasicBlock *BB) const {
13584  // This is pretty easy.  We're taking the value that we received from
13585  // our load from the relocation, sticking it in either RDI (x86-64)
13586  // or EAX and doing an indirect call.  The return value will then
13587  // be in the normal return register.
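  //
  // For the 64-bit case, the emitted sequence amounts to roughly the
  // following, where _var stands for the referenced global:
  //
  //     movq _var@TLVP(%rip), %rdi
  //     callq *(%rdi)
  //
  // with the result in RAX; the 32-bit variants are analogous, using EAX.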
13588  const X86InstrInfo *TII
13589    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
13590  DebugLoc DL = MI->getDebugLoc();
13591  MachineFunction *F = BB->getParent();
13592
13593  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
13594  assert(MI->getOperand(3).isGlobal() && "This should be a global");
13595
13596  // Get a register mask for the lowered call.
13597  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
13598  // proper register mask.
13599  const uint32_t *RegMask =
13600    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
13601  if (Subtarget->is64Bit()) {
13602    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
13603                                      TII->get(X86::MOV64rm), X86::RDI)
13604    .addReg(X86::RIP)
13605    .addImm(0).addReg(0)
13606    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
13607                      MI->getOperand(3).getTargetFlags())
13608    .addReg(0);
13609    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
13610    addDirectMem(MIB, X86::RDI);
13611    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
13612  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
13613    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
13614                                      TII->get(X86::MOV32rm), X86::EAX)
13615    .addReg(0)
13616    .addImm(0).addReg(0)
13617    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
13618                      MI->getOperand(3).getTargetFlags())
13619    .addReg(0);
13620    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
13621    addDirectMem(MIB, X86::EAX);
13622    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
13623  } else {
13624    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
13625                                      TII->get(X86::MOV32rm), X86::EAX)
13626    .addReg(TII->getGlobalBaseReg(F))
13627    .addImm(0).addReg(0)
13628    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
13629                      MI->getOperand(3).getTargetFlags())
13630    .addReg(0);
13631    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
13632    addDirectMem(MIB, X86::EAX);
13633    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
13634  }
13635
13636  MI->eraseFromParent(); // The pseudo instruction is gone now.
13637  return BB;
13638}
13639
13640MachineBasicBlock *
13641X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
13642                                    MachineBasicBlock *MBB) const {
13643  DebugLoc DL = MI->getDebugLoc();
13644  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13645
13646  MachineFunction *MF = MBB->getParent();
13647  MachineRegisterInfo &MRI = MF->getRegInfo();
13648
13649  const BasicBlock *BB = MBB->getBasicBlock();
13650  MachineFunction::iterator I = MBB;
13651  ++I;
13652
13653  // Memory Reference
13654  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
13655  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
13656
13657  unsigned DstReg;
13658  unsigned MemOpndSlot = 0;
13659
13660  unsigned CurOp = 0;
13661
13662  DstReg = MI->getOperand(CurOp++).getReg();
13663  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
13664  assert(RC->hasType(MVT::i32) && "Invalid destination!");
13665  unsigned mainDstReg = MRI.createVirtualRegister(RC);
13666  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
13667
13668  MemOpndSlot = CurOp;
13669
13670  MVT PVT = getPointerTy();
13671  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
13672         "Invalid Pointer Size!");
13673
13674  // For v = setjmp(buf), we generate
13675  //
13676  // thisMBB:
13677  //  buf[LabelOffset] = restoreMBB
13678  //  SjLjSetup restoreMBB
13679  //
13680  // mainMBB:
13681  //  v_main = 0
13682  //
13683  // sinkMBB:
13684  //  v = phi(main, restore)
13685  //
13686  // restoreMBB:
13687  //  v_restore = 1
13688
13689  MachineBasicBlock *thisMBB = MBB;
13690  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
13691  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
13692  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
13693  MF->insert(I, mainMBB);
13694  MF->insert(I, sinkMBB);
13695  MF->push_back(restoreMBB);
13696
13697  MachineInstrBuilder MIB;
13698
13699  // Transfer the remainder of BB and its successor edges to sinkMBB.
13700  sinkMBB->splice(sinkMBB->begin(), MBB,
13701                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
13702  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
13703
13704  // thisMBB:
13705  unsigned PtrStoreOpc = 0;
13706  unsigned LabelReg = 0;
13707  const int64_t LabelOffset = 1 * PVT.getStoreSize();
13708  Reloc::Model RM = getTargetMachine().getRelocationModel();
13709  bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
13710                     (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
13711
13712  // Prepare IP either in reg or imm.
13713  if (!UseImmLabel) {
13714    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
13715    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
13716    LabelReg = MRI.createVirtualRegister(PtrRC);
13717    if (Subtarget->is64Bit()) {
13718      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
13719              .addReg(X86::RIP)
13720              .addImm(0)
13721              .addReg(0)
13722              .addMBB(restoreMBB)
13723              .addReg(0);
13724    } else {
13725      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
13726      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
13727              .addReg(XII->getGlobalBaseReg(MF))
13728              .addImm(0)
13729              .addReg(0)
13730              .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
13731              .addReg(0);
13732    }
13733  } else
13734    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
13735  // Store IP
13736  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
13737  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
13738    if (i == X86::AddrDisp)
13739      MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
13740    else
13741      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
13742  }
13743  if (!UseImmLabel)
13744    MIB.addReg(LabelReg);
13745  else
13746    MIB.addMBB(restoreMBB);
13747  MIB.setMemRefs(MMOBegin, MMOEnd);
13748  // Setup
13749  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
13750          .addMBB(restoreMBB);
13751  MIB.addRegMask(RegInfo->getNoPreservedMask());
13752  thisMBB->addSuccessor(mainMBB);
13753  thisMBB->addSuccessor(restoreMBB);
13754
13755  // mainMBB:
13756  //  v_main = 0
13757  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
13758  mainMBB->addSuccessor(sinkMBB);
13759
13760  // sinkMBB:
13761  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
13762          TII->get(X86::PHI), DstReg)
13763    .addReg(mainDstReg).addMBB(mainMBB)
13764    .addReg(restoreDstReg).addMBB(restoreMBB);
13765
13766  // restoreMBB:
13767  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
13768  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
13769  restoreMBB->addSuccessor(sinkMBB);
13770
13771  MI->eraseFromParent();
13772  return sinkMBB;
13773}
13774
13775MachineBasicBlock *
13776X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
13777                                     MachineBasicBlock *MBB) const {
13778  DebugLoc DL = MI->getDebugLoc();
13779  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13780
13781  MachineFunction *MF = MBB->getParent();
13782  MachineRegisterInfo &MRI = MF->getRegInfo();
13783
13784  // Memory Reference
13785  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
13786  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
13787
13788  MVT PVT = getPointerTy();
13789  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
13790         "Invalid Pointer Size!");
13791
13792  const TargetRegisterClass *RC =
13793    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
13794  unsigned Tmp = MRI.createVirtualRegister(RC);
13795  // Since FP is only updated here but NOT referenced, it's treated as a GPR.
13796  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
13797  unsigned SP = RegInfo->getStackRegister();
13798
13799  MachineInstrBuilder MIB;
13800
13801  const int64_t LabelOffset = 1 * PVT.getStoreSize();
13802  const int64_t SPOffset = 2 * PVT.getStoreSize();
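  // The jump buffer is addressed as pointer-sized slots: slot 0 holds the
  // frame pointer, slot 1 (LabelOffset) the resume address stored by the
  // setjmp lowering, and slot 2 (SPOffset) the saved stack pointer.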
13803
13804  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
13805  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
13806
13807  // Reload FP
13808  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
13809  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
13810    MIB.addOperand(MI->getOperand(i));
13811  MIB.setMemRefs(MMOBegin, MMOEnd);
13812  // Reload IP
13813  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
13814  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
13815    if (i == X86::AddrDisp)
13816      MIB.addDisp(MI->getOperand(i), LabelOffset);
13817    else
13818      MIB.addOperand(MI->getOperand(i));
13819  }
13820  MIB.setMemRefs(MMOBegin, MMOEnd);
13821  // Reload SP
13822  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
13823  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
13824    if (i == X86::AddrDisp)
13825      MIB.addDisp(MI->getOperand(i), SPOffset);
13826    else
13827      MIB.addOperand(MI->getOperand(i));
13828  }
13829  MIB.setMemRefs(MMOBegin, MMOEnd);
13830  // Jump
13831  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
13832
13833  MI->eraseFromParent();
13834  return MBB;
13835}
13836
13837MachineBasicBlock *
13838X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
13839                                               MachineBasicBlock *BB) const {
13840  switch (MI->getOpcode()) {
13841  default: llvm_unreachable("Unexpected instr type to insert");
13842  case X86::TAILJMPd64:
13843  case X86::TAILJMPr64:
13844  case X86::TAILJMPm64:
13845    llvm_unreachable("TAILJMP64 would not be touched here.");
13846  case X86::TCRETURNdi64:
13847  case X86::TCRETURNri64:
13848  case X86::TCRETURNmi64:
13849    return BB;
13850  case X86::WIN_ALLOCA:
13851    return EmitLoweredWinAlloca(MI, BB);
13852  case X86::SEG_ALLOCA_32:
13853    return EmitLoweredSegAlloca(MI, BB, false);
13854  case X86::SEG_ALLOCA_64:
13855    return EmitLoweredSegAlloca(MI, BB, true);
13856  case X86::TLSCall_32:
13857  case X86::TLSCall_64:
13858    return EmitLoweredTLSCall(MI, BB);
13859  case X86::CMOV_GR8:
13860  case X86::CMOV_FR32:
13861  case X86::CMOV_FR64:
13862  case X86::CMOV_V4F32:
13863  case X86::CMOV_V2F64:
13864  case X86::CMOV_V2I64:
13865  case X86::CMOV_V8F32:
13866  case X86::CMOV_V4F64:
13867  case X86::CMOV_V4I64:
13868  case X86::CMOV_GR16:
13869  case X86::CMOV_GR32:
13870  case X86::CMOV_RFP32:
13871  case X86::CMOV_RFP64:
13872  case X86::CMOV_RFP80:
13873    return EmitLoweredSelect(MI, BB);
13874
13875  case X86::FP32_TO_INT16_IN_MEM:
13876  case X86::FP32_TO_INT32_IN_MEM:
13877  case X86::FP32_TO_INT64_IN_MEM:
13878  case X86::FP64_TO_INT16_IN_MEM:
13879  case X86::FP64_TO_INT32_IN_MEM:
13880  case X86::FP64_TO_INT64_IN_MEM:
13881  case X86::FP80_TO_INT16_IN_MEM:
13882  case X86::FP80_TO_INT32_IN_MEM:
13883  case X86::FP80_TO_INT64_IN_MEM: {
13884    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13885    DebugLoc DL = MI->getDebugLoc();
13886
13887    // Change the floating point control register to use "round towards zero"
13888    // mode when truncating to an integer value.
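    // The value 0xC7F stored below masks all FP exceptions and sets the
    // rounding-control bits (11:10) of the control word to 11b, i.e. round
    // toward zero.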
13889    MachineFunction *F = BB->getParent();
13890    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
13891    addFrameReference(BuildMI(*BB, MI, DL,
13892                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
13893
13894    // Load the old value of the control word...
13895    unsigned OldCW =
13896      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
13897    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
13898                      CWFrameIdx);
13899
13900    // Set the high part to be round to zero...
13901    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
13902      .addImm(0xC7F);
13903
13904    // Reload the modified control word now...
13905    addFrameReference(BuildMI(*BB, MI, DL,
13906                              TII->get(X86::FLDCW16m)), CWFrameIdx);
13907
13908    // Restore the memory image of control word to original value
13909    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
13910      .addReg(OldCW);
13911
13912    // Get the X86 opcode to use.
13913    unsigned Opc;
13914    switch (MI->getOpcode()) {
13915    default: llvm_unreachable("illegal opcode!");
13916    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
13917    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
13918    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
13919    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
13920    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
13921    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
13922    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
13923    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
13924    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
13925    }
13926
13927    X86AddressMode AM;
13928    MachineOperand &Op = MI->getOperand(0);
13929    if (Op.isReg()) {
13930      AM.BaseType = X86AddressMode::RegBase;
13931      AM.Base.Reg = Op.getReg();
13932    } else {
13933      AM.BaseType = X86AddressMode::FrameIndexBase;
13934      AM.Base.FrameIndex = Op.getIndex();
13935    }
13936    Op = MI->getOperand(1);
13937    if (Op.isImm())
13938      AM.Scale = Op.getImm();
13939    Op = MI->getOperand(2);
13940    if (Op.isImm())
13941      AM.IndexReg = Op.getImm();
13942    Op = MI->getOperand(3);
13943    if (Op.isGlobal()) {
13944      AM.GV = Op.getGlobal();
13945    } else {
13946      AM.Disp = Op.getImm();
13947    }
13948    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
13949                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
13950
13951    // Reload the original control word now.
13952    addFrameReference(BuildMI(*BB, MI, DL,
13953                              TII->get(X86::FLDCW16m)), CWFrameIdx);
13954
13955    MI->eraseFromParent();   // The pseudo instruction is gone now.
13956    return BB;
13957  }
13958  // String/text processing lowering.
13959  case X86::PCMPISTRM128REG:
13960  case X86::VPCMPISTRM128REG:
13961  case X86::PCMPISTRM128MEM:
13962  case X86::VPCMPISTRM128MEM:
13963  case X86::PCMPESTRM128REG:
13964  case X86::VPCMPESTRM128REG:
13965  case X86::PCMPESTRM128MEM:
13966  case X86::VPCMPESTRM128MEM:
13967    assert(Subtarget->hasSSE42() &&
13968           "Target must have SSE4.2 or AVX features enabled");
13969    return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
13970
13971  // String/text processing lowering.
13972  case X86::PCMPISTRIREG:
13973  case X86::VPCMPISTRIREG:
13974  case X86::PCMPISTRIMEM:
13975  case X86::VPCMPISTRIMEM:
13976  case X86::PCMPESTRIREG:
13977  case X86::VPCMPESTRIREG:
13978  case X86::PCMPESTRIMEM:
13979  case X86::VPCMPESTRIMEM:
13980    assert(Subtarget->hasSSE42() &&
13981           "Target must have SSE4.2 or AVX features enabled");
13982    return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
13983
13984  // Thread synchronization.
13985  case X86::MONITOR:
13986    return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
13987
13988  // xbegin
13989  case X86::XBEGIN:
13990    return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
13991
13992  // Atomic Lowering.
13993  case X86::ATOMAND8:
13994  case X86::ATOMAND16:
13995  case X86::ATOMAND32:
13996  case X86::ATOMAND64:
13997    // Fall through
13998  case X86::ATOMOR8:
13999  case X86::ATOMOR16:
14000  case X86::ATOMOR32:
14001  case X86::ATOMOR64:
14002    // Fall through
14003  case X86::ATOMXOR8:
14004  case X86::ATOMXOR16:
14005  case X86::ATOMXOR32:
14006  case X86::ATOMXOR64:
14007    // Fall through
14008  case X86::ATOMNAND8:
14009  case X86::ATOMNAND16:
14010  case X86::ATOMNAND32:
14011  case X86::ATOMNAND64:
14012    // Fall through
14013  case X86::ATOMMAX8:
14014  case X86::ATOMMAX16:
14015  case X86::ATOMMAX32:
14016  case X86::ATOMMAX64:
14017    // Fall through
14018  case X86::ATOMMIN8:
14019  case X86::ATOMMIN16:
14020  case X86::ATOMMIN32:
14021  case X86::ATOMMIN64:
14022    // Fall through
14023  case X86::ATOMUMAX8:
14024  case X86::ATOMUMAX16:
14025  case X86::ATOMUMAX32:
14026  case X86::ATOMUMAX64:
14027    // Fall through
14028  case X86::ATOMUMIN8:
14029  case X86::ATOMUMIN16:
14030  case X86::ATOMUMIN32:
14031  case X86::ATOMUMIN64:
14032    return EmitAtomicLoadArith(MI, BB);
14033
14034  // This group does 64-bit operations on a 32-bit host.
14035  case X86::ATOMAND6432:
14036  case X86::ATOMOR6432:
14037  case X86::ATOMXOR6432:
14038  case X86::ATOMNAND6432:
14039  case X86::ATOMADD6432:
14040  case X86::ATOMSUB6432:
14041  case X86::ATOMMAX6432:
14042  case X86::ATOMMIN6432:
14043  case X86::ATOMUMAX6432:
14044  case X86::ATOMUMIN6432:
14045  case X86::ATOMSWAP6432:
14046    return EmitAtomicLoadArith6432(MI, BB);
14047
14048  case X86::VASTART_SAVE_XMM_REGS:
14049    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
14050
14051  case X86::VAARG_64:
14052    return EmitVAARG64WithCustomInserter(MI, BB);
14053
14054  case X86::EH_SjLj_SetJmp32:
14055  case X86::EH_SjLj_SetJmp64:
14056    return emitEHSjLjSetJmp(MI, BB);
14057
14058  case X86::EH_SjLj_LongJmp32:
14059  case X86::EH_SjLj_LongJmp64:
14060    return emitEHSjLjLongJmp(MI, BB);
14061  }
14062}
14063
14064//===----------------------------------------------------------------------===//
14065//                           X86 Optimization Hooks
14066//===----------------------------------------------------------------------===//
14067
14068void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
14069                                                       APInt &KnownZero,
14070                                                       APInt &KnownOne,
14071                                                       const SelectionDAG &DAG,
14072                                                       unsigned Depth) const {
14073  unsigned BitWidth = KnownZero.getBitWidth();
14074  unsigned Opc = Op.getOpcode();
14075  assert((Opc >= ISD::BUILTIN_OP_END ||
14076          Opc == ISD::INTRINSIC_WO_CHAIN ||
14077          Opc == ISD::INTRINSIC_W_CHAIN ||
14078          Opc == ISD::INTRINSIC_VOID) &&
14079         "Should use MaskedValueIsZero if you don't know whether Op"
14080         " is a target node!");
14081
14082  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
14083  switch (Opc) {
14084  default: break;
14085  case X86ISD::ADD:
14086  case X86ISD::SUB:
14087  case X86ISD::ADC:
14088  case X86ISD::SBB:
14089  case X86ISD::SMUL:
14090  case X86ISD::UMUL:
14091  case X86ISD::INC:
14092  case X86ISD::DEC:
14093  case X86ISD::OR:
14094  case X86ISD::XOR:
14095  case X86ISD::AND:
14096    // These nodes' second result is a boolean.
14097    if (Op.getResNo() == 0)
14098      break;
14099    // Fallthrough
14100  case X86ISD::SETCC:
14101    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
14102    break;
14103  case ISD::INTRINSIC_WO_CHAIN: {
14104    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
14105    unsigned NumLoBits = 0;
14106    switch (IntId) {
14107    default: break;
14108    case Intrinsic::x86_sse_movmsk_ps:
14109    case Intrinsic::x86_avx_movmsk_ps_256:
14110    case Intrinsic::x86_sse2_movmsk_pd:
14111    case Intrinsic::x86_avx_movmsk_pd_256:
14112    case Intrinsic::x86_mmx_pmovmskb:
14113    case Intrinsic::x86_sse2_pmovmskb_128:
14114    case Intrinsic::x86_avx2_pmovmskb: {
14115      // High bits of movmskp{s|d}, pmovmskb are known zero.
14116      switch (IntId) {
14117        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
14118        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
14119        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
14120        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
14121        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
14122        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
14123        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
14124        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
14125      }
14126      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
14127      break;
14128    }
14129    }
14130    break;
14131  }
14132  }
14133}
14134
14135unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
14136                                                         unsigned Depth) const {
14137  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
14138  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
14139    return Op.getValueType().getScalarType().getSizeInBits();
14140
14141  // Fallback case.
14142  return 1;
14143}
14144
14145/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
14146/// node is a GlobalAddress + offset.
14147bool X86TargetLowering::isGAPlusOffset(SDNode *N,
14148                                       const GlobalValue* &GA,
14149                                       int64_t &Offset) const {
14150  if (N->getOpcode() == X86ISD::Wrapper) {
14151    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
14152      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
14153      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
14154      return true;
14155    }
14156  }
14157  return TargetLowering::isGAPlusOffset(N, GA, Offset);
14158}
14159
14160/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
14161/// same as extracting the high 128-bit part of a 256-bit vector and then
14162/// inserting the result into the low part of a new 256-bit vector.
14163static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
14164  EVT VT = SVOp->getValueType(0);
14165  unsigned NumElems = VT.getVectorNumElements();
14166
14167  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
14168  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
14169    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
14170        SVOp->getMaskElt(j) >= 0)
14171      return false;
14172
14173  return true;
14174}
14175
14176/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
14177/// same as extracting the low 128-bit part of a 256-bit vector and then
14178/// inserting the result into the high part of a new 256-bit vector.
14179static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
14180  EVT VT = SVOp->getValueType(0);
14181  unsigned NumElems = VT.getVectorNumElements();
14182
14183  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
14184  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
14185    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
14186        SVOp->getMaskElt(j) >= 0)
14187      return false;
14188
14189  return true;
14190}
14191
14192/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
14193static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
14194                                        TargetLowering::DAGCombinerInfo &DCI,
14195                                        const X86Subtarget* Subtarget) {
14196  DebugLoc dl = N->getDebugLoc();
14197  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
14198  SDValue V1 = SVOp->getOperand(0);
14199  SDValue V2 = SVOp->getOperand(1);
14200  EVT VT = SVOp->getValueType(0);
14201  unsigned NumElems = VT.getVectorNumElements();
14202
14203  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
14204      V2.getOpcode() == ISD::CONCAT_VECTORS) {
14205    //
14206    //                   0,0,0,...
14207    //                      |
14208    //    V      UNDEF    BUILD_VECTOR    UNDEF
14209    //     \      /           \           /
14210    //  CONCAT_VECTOR         CONCAT_VECTOR
14211    //         \                  /
14212    //          \                /
14213    //          RESULT: V + zero extended
14214    //
14215    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
14216        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
14217        V1.getOperand(1).getOpcode() != ISD::UNDEF)
14218      return SDValue();
14219
14220    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
14221      return SDValue();
14222
14223    // To match the shuffle mask, the first half of the mask should
14224    // be exactly the first vector, and all the rest a splat with the
14225    // first element of the second one.
14226    for (unsigned i = 0; i != NumElems/2; ++i)
14227      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
14228          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
14229        return SDValue();
14230
14231    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
14232    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
14233      if (Ld->hasNUsesOfValue(1, 0)) {
14234        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
14235        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
14236        SDValue ResNode =
14237          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
14238                                  Ld->getMemoryVT(),
14239                                  Ld->getPointerInfo(),
14240                                  Ld->getAlignment(),
14241                                  false/*isVolatile*/, true/*ReadMem*/,
14242                                  false/*WriteMem*/);
14243
14244        // Make sure the newly-created LOAD is in the same position as Ld in
14245        // terms of dependency. We create a TokenFactor for Ld and ResNode,
14246        // and update uses of Ld's output chain to use the TokenFactor.
14247        if (Ld->hasAnyUseOfValue(1)) {
14248          SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
14249                             SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
14250          DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
14251          DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
14252                                 SDValue(ResNode.getNode(), 1));
14253        }
14254
14255        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
14256      }
14257    }
14258
14259    // Emit a zeroed vector and insert the desired subvector on its
14260    // first half.
14261    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
14262    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
14263    return DCI.CombineTo(N, InsV);
14264  }
14265
14266  //===--------------------------------------------------------------------===//
14267  // Combine some shuffles into subvector extracts and inserts:
14268  //
14269
14270  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
14271  if (isShuffleHigh128VectorInsertLow(SVOp)) {
14272    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
14273    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
14274    return DCI.CombineTo(N, InsV);
14275  }
14276
14277  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
14278  if (isShuffleLow128VectorInsertHigh(SVOp)) {
14279    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
14280    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
14281    return DCI.CombineTo(N, InsV);
14282  }
14283
14284  return SDValue();
14285}
14286
14287/// PerformShuffleCombine - Performs several different shuffle combines.
14288static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
14289                                     TargetLowering::DAGCombinerInfo &DCI,
14290                                     const X86Subtarget *Subtarget) {
14291  DebugLoc dl = N->getDebugLoc();
14292  EVT VT = N->getValueType(0);
14293
14294  // Don't create instructions with illegal types after legalize types has run.
14295  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14296  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
14297    return SDValue();
14298
14299  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
14300  if (Subtarget->hasFp256() && VT.is256BitVector() &&
14301      N->getOpcode() == ISD::VECTOR_SHUFFLE)
14302    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
14303
14304  // Only handle 128-bit wide vectors from here on.
14305  if (!VT.is128BitVector())
14306    return SDValue();
14307
14308  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
14309  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
14310  // consecutive, non-overlapping, and in the right order.
14311  SmallVector<SDValue, 16> Elts;
14312  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
14313    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
14314
14315  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
14316}
14317
14318
14319/// PerformTruncateCombine - Converts a truncate operation into
14320/// a sequence of vector shuffle operations.
14321/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
14322static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
14323                                      TargetLowering::DAGCombinerInfo &DCI,
14324                                      const X86Subtarget *Subtarget)  {
14325  if (!DCI.isBeforeLegalizeOps())
14326    return SDValue();
14327
14328  if (!Subtarget->hasFp256())
14329    return SDValue();
14330
14331  EVT VT = N->getValueType(0);
14332  SDValue Op = N->getOperand(0);
14333  EVT OpVT = Op.getValueType();
14334  DebugLoc dl = N->getDebugLoc();
14335
14336  if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
14337
14338    if (Subtarget->hasInt256()) {
14339      // AVX2: v4i64 -> v4i32
14340
14341      // VPERMD
14342      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14343
14344      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
14345      Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
14346                                ShufMask);
14347
14348      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
14349                         DAG.getIntPtrConstant(0));
14350    }
14351
14352    // AVX: v4i64 -> v4i32
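    // Sketch of the lowering below: extract the two 128-bit halves, shuffle
    // each half so elements 0 and 2 land in the low 64 bits (PSHUFD), then
    // merge the two low halves with a MOVLHPS-style shuffle.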
14353    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
14354                               DAG.getIntPtrConstant(0));
14355
14356    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
14357                               DAG.getIntPtrConstant(2));
14358
14359    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
14360    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
14361
14362    // PSHUFD
14363    static const int ShufMask1[] = {0, 2, 0, 0};
14364
14365    SDValue Undef = DAG.getUNDEF(VT);
14366    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
14367    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);
14368
14369    // MOVLHPS
14370    static const int ShufMask2[] = {0, 1, 4, 5};
14371
14372    return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
14373  }
14374
14375  if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
14376
14377    if (Subtarget->hasInt256()) {
14378      // AVX2: v8i32 -> v8i16
14379
14380      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
14381
14382      // PSHUFB
14383      SmallVector<SDValue,32> pshufbMask;
14384      for (unsigned i = 0; i < 2; ++i) {
14385        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14386        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14387        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14388        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14389        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14390        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14391        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14392        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14393        for (unsigned j = 0; j < 8; ++j)
14394          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14395      }
14396      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
14397                               &pshufbMask[0], 32);
14398      Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
14399
14400      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
14401
14402      static const int ShufMask[] = {0,  2,  -1,  -1};
14403      Op = DAG.getVectorShuffle(MVT::v4i64, dl,  Op, DAG.getUNDEF(MVT::v4i64),
14404                                &ShufMask[0]);
14405
14406      Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
14407                       DAG.getIntPtrConstant(0));
14408
14409      return DAG.getNode(ISD::BITCAST, dl, VT, Op);
14410    }
14411
14412    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
14413                               DAG.getIntPtrConstant(0));
14414
14415    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
14416                               DAG.getIntPtrConstant(4));
14417
14418    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
14419    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
14420
14421    // PSHUFB
14422    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14423                                   -1, -1, -1, -1, -1, -1, -1, -1};
14424
14425    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14426    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1);
14427    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1);
14428
14429    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
14430    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
14431
14432    // MOVLHPS
14433    static const int ShufMask2[] = {0, 1, 4, 5};
14434
14435    SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
14436    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
14437  }
14438
14439  return SDValue();
14440}
14441
14442/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
14443/// specific shuffle of a load can be folded into a single element load.
14444/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
14445/// shuffles have been custom lowered, so we need to handle those here.
14446static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
14447                                         TargetLowering::DAGCombinerInfo &DCI) {
14448  if (DCI.isBeforeLegalizeOps())
14449    return SDValue();
14450
14451  SDValue InVec = N->getOperand(0);
14452  SDValue EltNo = N->getOperand(1);
14453
14454  if (!isa<ConstantSDNode>(EltNo))
14455    return SDValue();
14456
14457  EVT VT = InVec.getValueType();
14458
14459  bool HasShuffleIntoBitcast = false;
14460  if (InVec.getOpcode() == ISD::BITCAST) {
14461    // Don't duplicate a load with other uses.
14462    if (!InVec.hasOneUse())
14463      return SDValue();
14464    EVT BCVT = InVec.getOperand(0).getValueType();
14465    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
14466      return SDValue();
14467    InVec = InVec.getOperand(0);
14468    HasShuffleIntoBitcast = true;
14469  }
14470
14471  if (!isTargetShuffle(InVec.getOpcode()))
14472    return SDValue();
14473
14474  // Don't duplicate a load with other uses.
14475  if (!InVec.hasOneUse())
14476    return SDValue();
14477
14478  SmallVector<int, 16> ShuffleMask;
14479  bool UnaryShuffle;
14480  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
14481                            UnaryShuffle))
14482    return SDValue();
14483
14484  // Select the input vector, guarding against an out-of-range extract index.
14485  unsigned NumElems = VT.getVectorNumElements();
14486  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
14487  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
14488  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
14489                                         : InVec.getOperand(1);
14490
14491  // If inputs to shuffle are the same for both ops, then allow 2 uses
14492  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
14493
14494  if (LdNode.getOpcode() == ISD::BITCAST) {
14495    // Don't duplicate a load with other uses.
14496    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
14497      return SDValue();
14498
14499    AllowedUses = 1; // only allow 1 load use if we have a bitcast
14500    LdNode = LdNode.getOperand(0);
14501  }
14502
14503  if (!ISD::isNormalLoad(LdNode.getNode()))
14504    return SDValue();
14505
14506  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
14507
14508  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
14509    return SDValue();
14510
14511  if (HasShuffleIntoBitcast) {
14512    // If there's a bitcast before the shuffle, check if the load type and
14513    // alignment is valid.
14514    unsigned Align = LN0->getAlignment();
14515    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14516    unsigned NewAlign = TLI.getDataLayout()->
14517      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
14518
14519    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
14520      return SDValue();
14521  }
14522
14523  // All checks match, so transform back to vector_shuffle so that the DAG
14524  // combiner can finish the job.
14525  DebugLoc dl = N->getDebugLoc();
14526
14527  // Create shuffle node taking into account the case that it's a unary shuffle
14528  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
14529  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
14530                                 InVec.getOperand(0), Shuffle,
14531                                 &ShuffleMask[0]);
14532  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
14533  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
14534                     EltNo);
14535}
14536
14537/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
14538/// generation and convert it from being a bunch of shuffles and extracts
14539/// to a simple store and scalar loads to extract the elements.
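/// For example (a sketch): four sign- or zero-extended extracts covering all
/// elements of a v4i32 value become a single store of the vector to a stack
/// slot followed by four scalar i32 loads at offsets 0, 4, 8 and 12.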
14540static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
14541                                         TargetLowering::DAGCombinerInfo &DCI) {
14542  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
14543  if (NewOp.getNode())
14544    return NewOp;
14545
14546  SDValue InputVector = N->getOperand(0);
14547  // Detect whether we are trying to convert from mmx to i32 and the bitcast
14548  // from mmx to v2i32 has a single usage.
14549  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
14550      InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
14551      InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
14552    return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(),
14553                       N->getValueType(0),
14554                       InputVector.getNode()->getOperand(0));
14555
14556  // Only operate on vectors of 4 elements, where the alternative shuffling
14557  // gets to be more expensive.
14558  if (InputVector.getValueType() != MVT::v4i32)
14559    return SDValue();
14560
14561  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
14562  // single use which is a sign-extend or zero-extend, and all elements are
14563  // used.
14564  SmallVector<SDNode *, 4> Uses;
14565  unsigned ExtractedElements = 0;
14566  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
14567       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
14568    if (UI.getUse().getResNo() != InputVector.getResNo())
14569      return SDValue();
14570
14571    SDNode *Extract = *UI;
14572    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14573      return SDValue();
14574
14575    if (Extract->getValueType(0) != MVT::i32)
14576      return SDValue();
14577    if (!Extract->hasOneUse())
14578      return SDValue();
14579    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
14580        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
14581      return SDValue();
14582    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
14583      return SDValue();
14584
14585    // Record which element was extracted.
14586    ExtractedElements |=
14587      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
14588
14589    Uses.push_back(Extract);
14590  }
14591
14592  // If not all the elements were used, this may not be worthwhile.
14593  if (ExtractedElements != 15)
14594    return SDValue();
14595
14596  // Ok, we've now decided to do the transformation.
14597  DebugLoc dl = InputVector.getDebugLoc();
14598
14599  // Store the value to a temporary stack slot.
14600  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
14601  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
14602                            MachinePointerInfo(), false, false, 0);
14603
14604  // Replace each use (extract) with a load of the appropriate element.
14605  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
14606       UE = Uses.end(); UI != UE; ++UI) {
14607    SDNode *Extract = *UI;
14608
14609    // Compute the element's address.
14610    SDValue Idx = Extract->getOperand(1);
14611    unsigned EltSize =
14612        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
14613    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
14614    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14615    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
14616
14617    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
14618                                     StackPtr, OffsetVal);
14619
14620    // Load the scalar.
14621    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
14622                                     ScalarAddr, MachinePointerInfo(),
14623                                     false, false, false, 0);
14624
14625    // Replace the extract with the load.
14626    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
14627  }
14628
14629  // The replacement was made in place; don't return anything.
14630  return SDValue();
14631}
14632
14633/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
14634/// nodes.
14635static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
14636                                    TargetLowering::DAGCombinerInfo &DCI,
14637                                    const X86Subtarget *Subtarget) {
14638  DebugLoc DL = N->getDebugLoc();
14639  SDValue Cond = N->getOperand(0);
14640  // Get the LHS/RHS of the select.
14641  SDValue LHS = N->getOperand(1);
14642  SDValue RHS = N->getOperand(2);
14643  EVT VT = LHS.getValueType();
14644
14645  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
14646  // instructions match the semantics of the common C idiom x<y?x:y but not
14647  // x<=y?x:y, because of how they handle negative zero (which can be
14648  // ignored in unsafe-math mode).
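       // For illustration (schematic only), two of the cases handled below:
       //   (select (setolt x, y), x, y) -> (X86ISD::FMIN x, y)
       //   (select (setogt x, y), x, y) -> (X86ISD::FMAX x, y)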
14649  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
14650      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
14651      (Subtarget->hasSSE2() ||
14652       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
14653    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
14654
14655    unsigned Opcode = 0;
14656    // Check for x CC y ? x : y.
14657    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
14658        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
14659      switch (CC) {
14660      default: break;
14661      case ISD::SETULT:
14662        // Converting this to a min would handle NaNs incorrectly, and swapping
14663        // the operands would cause it to handle comparisons between positive
14664        // and negative zero incorrectly.
14665        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
14666          if (!DAG.getTarget().Options.UnsafeFPMath &&
14667              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
14668            break;
14669          std::swap(LHS, RHS);
14670        }
14671        Opcode = X86ISD::FMIN;
14672        break;
14673      case ISD::SETOLE:
14674        // Converting this to a min would handle comparisons between positive
14675        // and negative zero incorrectly.
14676        if (!DAG.getTarget().Options.UnsafeFPMath &&
14677            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
14678          break;
14679        Opcode = X86ISD::FMIN;
14680        break;
14681      case ISD::SETULE:
14682        // Converting this to a min would handle both negative zeros and NaNs
14683        // incorrectly, but we can swap the operands to fix both.
14684        std::swap(LHS, RHS);
14685      case ISD::SETOLT:
14686      case ISD::SETLT:
14687      case ISD::SETLE:
14688        Opcode = X86ISD::FMIN;
14689        break;
14690
14691      case ISD::SETOGE:
14692        // Converting this to a max would handle comparisons between positive
14693        // and negative zero incorrectly.
14694        if (!DAG.getTarget().Options.UnsafeFPMath &&
14695            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
14696          break;
14697        Opcode = X86ISD::FMAX;
14698        break;
14699      case ISD::SETUGT:
14700        // Converting this to a max would handle NaNs incorrectly, and swapping
14701        // the operands would cause it to handle comparisons between positive
14702        // and negative zero incorrectly.
14703        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
14704          if (!DAG.getTarget().Options.UnsafeFPMath &&
14705              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
14706            break;
14707          std::swap(LHS, RHS);
14708        }
14709        Opcode = X86ISD::FMAX;
14710        break;
14711      case ISD::SETUGE:
14712        // Converting this to a max would handle both negative zeros and NaNs
14713        // incorrectly, but we can swap the operands to fix both.
14714        std::swap(LHS, RHS);
14715      case ISD::SETOGT:
14716      case ISD::SETGT:
14717      case ISD::SETGE:
14718        Opcode = X86ISD::FMAX;
14719        break;
14720      }
14721    // Check for x CC y ? y : x -- a min/max with reversed arms.
14722    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
14723               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
14724      switch (CC) {
14725      default: break;
14726      case ISD::SETOGE:
14727        // Converting this to a min would handle comparisons between positive
14728        // and negative zero incorrectly, and swapping the operands would
14729        // cause it to handle NaNs incorrectly.
14730        if (!DAG.getTarget().Options.UnsafeFPMath &&
14731            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
14732          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
14733            break;
14734          std::swap(LHS, RHS);
14735        }
14736        Opcode = X86ISD::FMIN;
14737        break;
14738      case ISD::SETUGT:
14739        // Converting this to a min would handle NaNs incorrectly.
14740        if (!DAG.getTarget().Options.UnsafeFPMath &&
14741            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
14742          break;
14743        Opcode = X86ISD::FMIN;
14744        break;
14745      case ISD::SETUGE:
14746        // Converting this to a min would handle both negative zeros and NaNs
14747        // incorrectly, but we can swap the operands to fix both.
14748        std::swap(LHS, RHS);
14749      case ISD::SETOGT:
14750      case ISD::SETGT:
14751      case ISD::SETGE:
14752        Opcode = X86ISD::FMIN;
14753        break;
14754
14755      case ISD::SETULT:
14756        // Converting this to a max would handle NaNs incorrectly.
14757        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
14758          break;
14759        Opcode = X86ISD::FMAX;
14760        break;
14761      case ISD::SETOLE:
14762        // Converting this to a max would handle comparisons between positive
14763        // and negative zero incorrectly, and swapping the operands would
14764        // cause it to handle NaNs incorrectly.
14765        if (!DAG.getTarget().Options.UnsafeFPMath &&
14766            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
14767          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
14768            break;
14769          std::swap(LHS, RHS);
14770        }
14771        Opcode = X86ISD::FMAX;
14772        break;
14773      case ISD::SETULE:
14774        // Converting this to a max would handle both negative zeros and NaNs
14775        // incorrectly, but we can swap the operands to fix both.
14776        std::swap(LHS, RHS);
14777      case ISD::SETOLT:
14778      case ISD::SETLT:
14779      case ISD::SETLE:
14780        Opcode = X86ISD::FMAX;
14781        break;
14782      }
14783    }
14784
14785    if (Opcode)
14786      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
14787  }
14788
14789  // If this is a select between two integer constants, try to do some
14790  // optimizations.
14791  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
14792    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
14793      // Don't do this for crazy integer types.
14794      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
14795        // If this is efficiently invertible, canonicalize the TrueC/FalseC values
14796        // so that TrueC (the true value) is larger than FalseC.
14797        bool NeedsCondInvert = false;
14798
14799        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
14800            // Efficiently invertible.
14801            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
14802             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
14803              isa<ConstantSDNode>(Cond.getOperand(1))))) {
14804          NeedsCondInvert = true;
14805          std::swap(TrueC, FalseC);
14806        }
14807
14808        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
14809        if (FalseC->getAPIntValue() == 0 &&
14810            TrueC->getAPIntValue().isPowerOf2()) {
14811          if (NeedsCondInvert) // Invert the condition if needed.
14812            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
14813                               DAG.getConstant(1, Cond.getValueType()));
14814
14815          // Zero extend the condition if needed.
14816          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
14817
14818          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
14819          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
14820                             DAG.getConstant(ShAmt, MVT::i8));
14821        }
14822
14823        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
14824        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
14825          if (NeedsCondInvert) // Invert the condition if needed.
14826            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
14827                               DAG.getConstant(1, Cond.getValueType()));
14828
14829          // Zero extend the condition if needed.
14830          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
14831                             FalseC->getValueType(0), Cond);
14832          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
14833                             SDValue(FalseC, 0));
14834        }
14835
14836        // Optimize cases that will turn into an LEA instruction.  This requires
14837        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
14838        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
14839          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
14840          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
14841
14842          bool isFastMultiplier = false;
14843          if (Diff < 10) {
14844            switch ((unsigned char)Diff) {
14845              default: break;
14846              case 1:  // result = add base, cond
14847              case 2:  // result = lea base(    , cond*2)
14848              case 3:  // result = lea base(cond, cond*2)
14849              case 4:  // result = lea base(    , cond*4)
14850              case 5:  // result = lea base(cond, cond*4)
14851              case 8:  // result = lea base(    , cond*8)
14852              case 9:  // result = lea base(cond, cond*8)
14853                isFastMultiplier = true;
14854                break;
14855            }
14856          }
14857
14858          if (isFastMultiplier) {
14859            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
14860            if (NeedsCondInvert) // Invert the condition if needed.
14861              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
14862                                 DAG.getConstant(1, Cond.getValueType()));
14863
14864            // Zero extend the condition if needed.
14865            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
14866                               Cond);
14867            // Scale the condition by the difference.
14868            if (Diff != 1)
14869              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
14870                                 DAG.getConstant(Diff, Cond.getValueType()));
14871
14872            // Add the base if non-zero.
14873            if (FalseC->getAPIntValue() != 0)
14874              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
14875                                 SDValue(FalseC, 0));
14876            return Cond;
14877          }
14878        }
14879      }
14880  }
14881
14882  // Canonicalize max and min:
14883  // (x > y) ? x : y -> (x >= y) ? x : y
14884  // (x < y) ? x : y -> (x <= y) ? x : y
14885  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
14886  // the need for an extra compare
14887  // against zero. e.g.
14888  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
14889  // subl   %esi, %edi
14890  // testl  %edi, %edi
14891  // movl   $0, %eax
14892  // cmovgl %edi, %eax
14893  // =>
14894  // xorl   %eax, %eax
14895  // subl   %esi, %edi
14896  // cmovsl %eax, %edi
14897  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
14898      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
14899      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
14900    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
14901    switch (CC) {
14902    default: break;
14903    case ISD::SETLT:
14904    case ISD::SETGT: {
14905      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
14906      Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
14907                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
14908      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
14909    }
14910    }
14911  }
14912
14913  // If we know that this node is legal then we know that it is going to be
14914  // matched by one of the SSE/AVX BLEND instructions. These instructions only
14915  // depend on the highest bit in each element. Try to use SimplifyDemandedBits
14916  // to simplify previous instructions.
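       // For illustration: the DemandedMask built below keeps only the sign bit of
       // each lane of Cond, so bits below the sign bit are treated as don't-care
       // when the nodes that feed Cond are simplified.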
14917  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14918  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
14919      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
14920    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
14921
14922    // Don't optimize vector selects that map to mask-registers.
14923    if (BitWidth == 1)
14924      return SDValue();
14925
14926    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
14927    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
14928
14929    APInt KnownZero, KnownOne;
14930    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
14931                                          DCI.isBeforeLegalizeOps());
14932    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
14933        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
14934      DCI.CommitTargetLoweringOpt(TLO);
14935  }
14936
14937  return SDValue();
14938}
14939
14940// Check whether a boolean test is testing a boolean value generated by
14941// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
14942// code.
14943//
14944// Simplify the following patterns:
14945// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
14946// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
14947// to (Op EFLAGS Cond)
14948//
14949// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
14950// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
14951// to (Op EFLAGS !Cond)
14952//
14953// where Op could be BRCOND or CMOV.
14954//
14955static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
14956  // Quit unless this is a CMP, or a SUB whose value result is unused.
14957  if (Cmp.getOpcode() != X86ISD::CMP &&
14958      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
14959      return SDValue();
14960
14961  // Quit if not used as a boolean value.
14962  if (CC != X86::COND_E && CC != X86::COND_NE)
14963    return SDValue();
14964
14965  // Check the CMP operands. One of them should be 0 or 1 and the other should
14966  // be a SetCC or an extension of one.
14967  SDValue Op1 = Cmp.getOperand(0);
14968  SDValue Op2 = Cmp.getOperand(1);
14969
14970  SDValue SetCC;
14971  const ConstantSDNode* C = 0;
14972  bool needOppositeCond = (CC == X86::COND_E);
14973
14974  if ((C = dyn_cast<ConstantSDNode>(Op1)))
14975    SetCC = Op2;
14976  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
14977    SetCC = Op1;
14978  else // Quit if neither operand is a constant.
14979    return SDValue();
14980
14981  if (C->getZExtValue() == 1)
14982    needOppositeCond = !needOppositeCond;
14983  else if (C->getZExtValue() != 0)
14984    // Quit if the constant is neither 0 nor 1.
14985    return SDValue();
14986
14987  // Skip 'zext' node.
14988  if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
14989    SetCC = SetCC.getOperand(0);
14990
14991  switch (SetCC.getOpcode()) {
14992  case X86ISD::SETCC:
14993    // Set the condition code or opposite one if necessary.
14994    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
14995    if (needOppositeCond)
14996      CC = X86::GetOppositeBranchCondition(CC);
14997    return SetCC.getOperand(1);
14998  case X86ISD::CMOV: {
14999    // Check whether false/true value has canonical one, i.e. 0 or 1.
15000    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
15001    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
15002    // Quit if true value is not a constant.
15003    if (!TVal)
15004      return SDValue();
15005    // Quit if false value is not a constant.
15006    if (!FVal) {
15007      // A special case for rdrand, where 0 is used when the condition is false.
15008      SDValue Op = SetCC.getOperand(0);
15009      if (Op.getOpcode() != X86ISD::RDRAND)
15010        return SDValue();
15011    }
15012    // Quit if false value is not the constant 0 or 1.
15013    bool FValIsFalse = true;
15014    if (FVal && FVal->getZExtValue() != 0) {
15015      if (FVal->getZExtValue() != 1)
15016        return SDValue();
15017      // If FVal is 1, opposite cond is needed.
15018      needOppositeCond = !needOppositeCond;
15019      FValIsFalse = false;
15020    }
15021    // Quit if TVal is not the constant opposite of FVal.
15022    if (FValIsFalse && TVal->getZExtValue() != 1)
15023      return SDValue();
15024    if (!FValIsFalse && TVal->getZExtValue() != 0)
15025      return SDValue();
15026    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
15027    if (needOppositeCond)
15028      CC = X86::GetOppositeBranchCondition(CC);
15029    return SetCC.getOperand(3);
15030  }
15031  }
15032
15033  return SDValue();
15034}
15035
15036/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
15037static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
15038                                  TargetLowering::DAGCombinerInfo &DCI,
15039                                  const X86Subtarget *Subtarget) {
15040  DebugLoc DL = N->getDebugLoc();
15041
15042  // If the flag operand isn't dead, don't touch this CMOV.
15043  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
15044    return SDValue();
15045
15046  SDValue FalseOp = N->getOperand(0);
15047  SDValue TrueOp = N->getOperand(1);
15048  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
15049  SDValue Cond = N->getOperand(3);
15050
15051  if (CC == X86::COND_E || CC == X86::COND_NE) {
15052    switch (Cond.getOpcode()) {
15053    default: break;
15054    case X86ISD::BSR:
15055    case X86ISD::BSF:
15056      // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
15057      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
15058        return (CC == X86::COND_E) ? FalseOp : TrueOp;
15059    }
15060  }
15061
15062  SDValue Flags;
15063
15064  Flags = checkBoolTestSetCCCombine(Cond, CC);
15065  if (Flags.getNode() &&
15066      // Extra check as FCMOV only supports a subset of X86 cond.
15067      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
15068    SDValue Ops[] = { FalseOp, TrueOp,
15069                      DAG.getConstant(CC, MVT::i8), Flags };
15070    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
15071                       Ops, array_lengthof(Ops));
15072  }
15073
15074  // If this is a select between two integer constants, try to do some
15075  // optimizations.  Note that the operands are ordered the opposite of SELECT
15076  // operands.
15077  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
15078    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
15079      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
15080      // larger than FalseC (the false value).
15081      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
15082        CC = X86::GetOppositeBranchCondition(CC);
15083        std::swap(TrueC, FalseC);
15084        std::swap(TrueOp, FalseOp);
15085      }
15086
15087      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
15088      // This is efficient for any integer data type (including i8/i16) and
15089      // shift amount.
15090      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
15091        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
15092                           DAG.getConstant(CC, MVT::i8), Cond);
15093
15094        // Zero extend the condition if needed.
15095        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
15096
15097        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
15098        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
15099                           DAG.getConstant(ShAmt, MVT::i8));
15100        if (N->getNumValues() == 2)  // Dead flag value?
15101          return DCI.CombineTo(N, Cond, SDValue());
15102        return Cond;
15103      }
15104
15105      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
15106      // for any integer data type, including i8/i16.
15107      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
15108        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
15109                           DAG.getConstant(CC, MVT::i8), Cond);
15110
15111        // Zero extend the condition if needed.
15112        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
15113                           FalseC->getValueType(0), Cond);
15114        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
15115                           SDValue(FalseC, 0));
15116
15117        if (N->getNumValues() == 2)  // Dead flag value?
15118          return DCI.CombineTo(N, Cond, SDValue());
15119        return Cond;
15120      }
15121
15122      // Optimize cases that will turn into an LEA instruction.  This requires
15123      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
15124      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
15125        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
15126        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
15127
15128        bool isFastMultiplier = false;
15129        if (Diff < 10) {
15130          switch ((unsigned char)Diff) {
15131          default: break;
15132          case 1:  // result = add base, cond
15133          case 2:  // result = lea base(    , cond*2)
15134          case 3:  // result = lea base(cond, cond*2)
15135          case 4:  // result = lea base(    , cond*4)
15136          case 5:  // result = lea base(cond, cond*4)
15137          case 8:  // result = lea base(    , cond*8)
15138          case 9:  // result = lea base(cond, cond*8)
15139            isFastMultiplier = true;
15140            break;
15141          }
15142        }
15143
15144        if (isFastMultiplier) {
15145          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
15146          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
15147                             DAG.getConstant(CC, MVT::i8), Cond);
15148          // Zero extend the condition if needed.
15149          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
15150                             Cond);
15151          // Scale the condition by the difference.
15152          if (Diff != 1)
15153            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
15154                               DAG.getConstant(Diff, Cond.getValueType()));
15155
15156          // Add the base if non-zero.
15157          if (FalseC->getAPIntValue() != 0)
15158            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
15159                               SDValue(FalseC, 0));
15160          if (N->getNumValues() == 2)  // Dead flag value?
15161            return DCI.CombineTo(N, Cond, SDValue());
15162          return Cond;
15163        }
15164      }
15165    }
15166  }
15167
15168  // Handle these cases:
15169  //   (select (x != c), e, c) -> (select (x != c), e, x)
15170  //   (select (x == c), c, e) -> (select (x == c), x, e)
15171  // where c is an integer constant, and the "select" is the combination
15172  // of CMOV and CMP.
15173  //
15174  // The rationale for this change is that the conditional-move from a constant
15175  // needs two instructions, however, conditional-move from a register needs
15176  // only one instruction.
15177  //
15178  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
15179  //  some instruction-combining opportunities. This opt needs to be
15180  //  postponed as late as possible.
15181  //
15182  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
15183    // the DCI.xxxx conditions are provided to postpone the optimization as
15184    // late as possible.
15185
15186    ConstantSDNode *CmpAgainst = 0;
15187    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
15188        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
15189        dyn_cast<ConstantSDNode>(Cond.getOperand(0)) == 0) {
15190
15191      if (CC == X86::COND_NE &&
15192          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
15193        CC = X86::GetOppositeBranchCondition(CC);
15194        std::swap(TrueOp, FalseOp);
15195      }
15196
15197      if (CC == X86::COND_E &&
15198          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
15199        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
15200                          DAG.getConstant(CC, MVT::i8), Cond };
15201        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops,
15202                           array_lengthof(Ops));
15203      }
15204    }
15205  }
15206
15207  return SDValue();
15208}
15209
15210
15211/// PerformMulCombine - Optimize a single multiply with constant into two
15212/// in order to implement it with two cheaper instructions, e.g.
15213/// LEA + SHL, LEA + LEA.
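     /// For example (illustrative): a multiply by 40 is split as 40 = 5 * 8, i.e.
     /// a multiply by 5 (an LEA) and a multiply by 8 (a shift), in the order
     /// chosen below.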
15214static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
15215                                 TargetLowering::DAGCombinerInfo &DCI) {
15216  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15217    return SDValue();
15218
15219  EVT VT = N->getValueType(0);
15220  if (VT != MVT::i64)
15221    return SDValue();
15222
15223  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
15224  if (!C)
15225    return SDValue();
15226  uint64_t MulAmt = C->getZExtValue();
15227  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
15228    return SDValue();
15229
15230  uint64_t MulAmt1 = 0;
15231  uint64_t MulAmt2 = 0;
15232  if ((MulAmt % 9) == 0) {
15233    MulAmt1 = 9;
15234    MulAmt2 = MulAmt / 9;
15235  } else if ((MulAmt % 5) == 0) {
15236    MulAmt1 = 5;
15237    MulAmt2 = MulAmt / 5;
15238  } else if ((MulAmt % 3) == 0) {
15239    MulAmt1 = 3;
15240    MulAmt2 = MulAmt / 3;
15241  }
15242  if (MulAmt2 &&
15243      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
15244    DebugLoc DL = N->getDebugLoc();
15245
15246    if (isPowerOf2_64(MulAmt2) &&
15247        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
15248      // If the second multiplier is pow2, issue it first. We want the multiply by
15249      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
15250      // is an add.
15251      std::swap(MulAmt1, MulAmt2);
15252
15253    SDValue NewMul;
15254    if (isPowerOf2_64(MulAmt1))
15255      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
15256                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
15257    else
15258      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
15259                           DAG.getConstant(MulAmt1, VT));
15260
15261    if (isPowerOf2_64(MulAmt2))
15262      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
15263                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
15264    else
15265      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
15266                           DAG.getConstant(MulAmt2, VT));
15267
15268    // Do not add new nodes to DAG combiner worklist.
15269    DCI.CombineTo(N, NewMul, false);
15270  }
15271  return SDValue();
15272}
15273
15274static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
15275  SDValue N0 = N->getOperand(0);
15276  SDValue N1 = N->getOperand(1);
15277  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
15278  EVT VT = N0.getValueType();
15279
15280  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
15281  // since the result of setcc_c is all zeros or all ones.
15282  if (VT.isInteger() && !VT.isVector() &&
15283      N1C && N0.getOpcode() == ISD::AND &&
15284      N0.getOperand(1).getOpcode() == ISD::Constant) {
15285    SDValue N00 = N0.getOperand(0);
15286    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
15287        ((N00.getOpcode() == ISD::ANY_EXTEND ||
15288          N00.getOpcode() == ISD::ZERO_EXTEND) &&
15289         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
15290      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
15291      APInt ShAmt = N1C->getAPIntValue();
15292      Mask = Mask.shl(ShAmt);
15293      if (Mask != 0)
15294        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
15295                           N00, DAG.getConstant(Mask, VT));
15296    }
15297  }
15298
15299
15300  // Hardware support for vector shifts is sparse, which makes us scalarize the
15301  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
15302  // shl.
15303  // (shl V, 1) -> add V,V
15304  if (isSplatVector(N1.getNode())) {
15305    assert(N0.getValueType().isVector() && "Invalid vector shift type");
15306    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
15307    // We shift all of the values by one. In many cases we do not have
15308    // hardware support for this operation. This is better expressed as an ADD
15309    // of two values.
15310    if (N1C && (1 == N1C->getZExtValue())) {
15311      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
15312    }
15313  }
15314
15315  return SDValue();
15316}
15317
15318/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
15319///                       when possible.
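     /// For example (illustrative): if every lane of the shift amount is the same
     /// value s, (srl v4i32 V, <s,s,s,s>) is turned into a single target vector
     /// shift of V by s via getTargetVShiftNode (X86ISD::VSRLI here).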
15320static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
15321                                   TargetLowering::DAGCombinerInfo &DCI,
15322                                   const X86Subtarget *Subtarget) {
15323  EVT VT = N->getValueType(0);
15324  if (N->getOpcode() == ISD::SHL) {
15325    SDValue V = PerformSHLCombine(N, DAG);
15326    if (V.getNode()) return V;
15327  }
15328
15329  // On X86 with SSE2 support, we can transform this to a vector shift if
15330  // all elements are shifted by the same amount.  We can't do this in legalize
15331  // because a constant vector is typically turned into a constant-pool load,
15332  // so we have no knowledge of the shift amount.
15333  if (!Subtarget->hasSSE2())
15334    return SDValue();
15335
15336  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
15337      (!Subtarget->hasInt256() ||
15338       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
15339    return SDValue();
15340
15341  SDValue ShAmtOp = N->getOperand(1);
15342  EVT EltVT = VT.getVectorElementType();
15343  DebugLoc DL = N->getDebugLoc();
15344  SDValue BaseShAmt = SDValue();
15345  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
15346    unsigned NumElts = VT.getVectorNumElements();
15347    unsigned i = 0;
15348    for (; i != NumElts; ++i) {
15349      SDValue Arg = ShAmtOp.getOperand(i);
15350      if (Arg.getOpcode() == ISD::UNDEF) continue;
15351      BaseShAmt = Arg;
15352      break;
15353    }
15354    // Handle the case where the build_vector is all undef
15355    // FIXME: Should DAG allow this?
15356    if (i == NumElts)
15357      return SDValue();
15358
15359    for (; i != NumElts; ++i) {
15360      SDValue Arg = ShAmtOp.getOperand(i);
15361      if (Arg.getOpcode() == ISD::UNDEF) continue;
15362      if (Arg != BaseShAmt) {
15363        return SDValue();
15364      }
15365    }
15366  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
15367             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
15368    SDValue InVec = ShAmtOp.getOperand(0);
15369    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
15370      unsigned NumElts = InVec.getValueType().getVectorNumElements();
15371      unsigned i = 0;
15372      for (; i != NumElts; ++i) {
15373        SDValue Arg = InVec.getOperand(i);
15374        if (Arg.getOpcode() == ISD::UNDEF) continue;
15375        BaseShAmt = Arg;
15376        break;
15377      }
15378    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15379       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
15380         unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
15381         if (C->getZExtValue() == SplatIdx)
15382           BaseShAmt = InVec.getOperand(1);
15383       }
15384    }
15385    if (BaseShAmt.getNode() == 0) {
15386      // Don't create instructions with illegal types after legalize
15387      // types has run.
15388      if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) &&
15389          !DCI.isBeforeLegalize())
15390        return SDValue();
15391
15392      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
15393                              DAG.getIntPtrConstant(0));
15394    }
15395  } else
15396    return SDValue();
15397
15398  // The shift amount must be an i32; truncate or zero-extend it as needed.
15399  if (EltVT.bitsGT(MVT::i32))
15400    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
15401  else if (EltVT.bitsLT(MVT::i32))
15402    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
15403
15404  // The shift amount is identical for all elements, so we can do a vector shift.
15405  SDValue ValOp = N->getOperand(0);
15406  switch (N->getOpcode()) {
15407  default:
15408    llvm_unreachable("Unknown shift opcode!");
15409  case ISD::SHL:
15410    switch (VT.getSimpleVT().SimpleTy) {
15411    default: return SDValue();
15412    case MVT::v2i64:
15413    case MVT::v4i32:
15414    case MVT::v8i16:
15415    case MVT::v4i64:
15416    case MVT::v8i32:
15417    case MVT::v16i16:
15418      return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG);
15419    }
15420  case ISD::SRA:
15421    switch (VT.getSimpleVT().SimpleTy) {
15422    default: return SDValue();
15423    case MVT::v4i32:
15424    case MVT::v8i16:
15425    case MVT::v8i32:
15426    case MVT::v16i16:
15427      return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG);
15428    }
15429  case ISD::SRL:
15430    switch (VT.getSimpleVT().SimpleTy) {
15431    default: return SDValue();
15432    case MVT::v2i64:
15433    case MVT::v4i32:
15434    case MVT::v8i16:
15435    case MVT::v4i64:
15436    case MVT::v8i32:
15437    case MVT::v16i16:
15438      return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG);
15439    }
15440  }
15441}
15442
15443
15444// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ...)) pattern
15445// where both setccs reference the same FP CMP, and rewrite it to use CMPEQSS
15446// and friends.  Likewise for OR -> CMPNEQSS.
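     // For illustration (schematic only), the f32 equality case handled below:
     //   (and (setcc E, (cmp x, y)), (setcc NP, (cmp x, y)))
     //     -> (trunc i8 (and (bitcast i32 (FSETCCss x, y, 0)), 1))   // CMPEQSS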
15447static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
15448                            TargetLowering::DAGCombinerInfo &DCI,
15449                            const X86Subtarget *Subtarget) {
15450  unsigned opcode;
15451
15452  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
15453  // we're requiring SSE2 for both.
15454  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
15455    SDValue N0 = N->getOperand(0);
15456    SDValue N1 = N->getOperand(1);
15457    SDValue CMP0 = N0->getOperand(1);
15458    SDValue CMP1 = N1->getOperand(1);
15459    DebugLoc DL = N->getDebugLoc();
15460
15461    // The SETCCs should both refer to the same CMP.
15462    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
15463      return SDValue();
15464
15465    SDValue CMP00 = CMP0->getOperand(0);
15466    SDValue CMP01 = CMP0->getOperand(1);
15467    EVT     VT    = CMP00.getValueType();
15468
15469    if (VT == MVT::f32 || VT == MVT::f64) {
15470      bool ExpectingFlags = false;
15471      // Check for any users that want flags:
15472      for (SDNode::use_iterator UI = N->use_begin(),
15473             UE = N->use_end();
15474           !ExpectingFlags && UI != UE; ++UI)
15475        switch (UI->getOpcode()) {
15476        default:
15477        case ISD::BR_CC:
15478        case ISD::BRCOND:
15479        case ISD::SELECT:
15480          ExpectingFlags = true;
15481          break;
15482        case ISD::CopyToReg:
15483        case ISD::SIGN_EXTEND:
15484        case ISD::ZERO_EXTEND:
15485        case ISD::ANY_EXTEND:
15486          break;
15487        }
15488
15489      if (!ExpectingFlags) {
15490        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
15491        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
15492
15493        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
15494          X86::CondCode tmp = cc0;
15495          cc0 = cc1;
15496          cc1 = tmp;
15497        }
15498
15499        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
15500            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
15501          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
15502          X86ISD::NodeType NTOperator = is64BitFP ?
15503            X86ISD::FSETCCsd : X86ISD::FSETCCss;
15504          // FIXME: need symbolic constants for these magic numbers.
15505          // See X86ATTInstPrinter.cpp:printSSECC().
15506          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
15507          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
15508                                              DAG.getConstant(x86cc, MVT::i8));
15509          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
15510                                              OnesOrZeroesF);
15511          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
15512                                      DAG.getConstant(1, MVT::i32));
15513          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
15514          return OneBitOfTruth;
15515        }
15516      }
15517    }
15518  }
15519  return SDValue();
15520}
15521
15522/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
15523/// so it can be folded inside ANDNP.
15524static bool CanFoldXORWithAllOnes(const SDNode *N) {
15525  EVT VT = N->getValueType(0);
15526
15527  // Match direct AllOnes for 128- and 256-bit vectors.
15528  if (ISD::isBuildVectorAllOnes(N))
15529    return true;
15530
15531  // Look through a bit convert.
15532  if (N->getOpcode() == ISD::BITCAST)
15533    N = N->getOperand(0).getNode();
15534
15535  // Sometimes the operand may come from an insert_subvector building a 256-bit
15536  // all-ones vector.
15537  if (VT.is256BitVector() &&
15538      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
15539    SDValue V1 = N->getOperand(0);
15540    SDValue V2 = N->getOperand(1);
15541
15542    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
15543        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
15544        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
15545        ISD::isBuildVectorAllOnes(V2.getNode()))
15546      return true;
15547  }
15548
15549  return false;
15550}
15551
15552static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
15553                                 TargetLowering::DAGCombinerInfo &DCI,
15554                                 const X86Subtarget *Subtarget) {
15555  if (DCI.isBeforeLegalizeOps())
15556    return SDValue();
15557
15558  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
15559  if (R.getNode())
15560    return R;
15561
15562  EVT VT = N->getValueType(0);
15563
15564  // Create ANDN, BLSI, and BLSR instructions
15565  // BLSI is X & (-X)
15566  // BLSR is X & (X-1)
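       // For illustration: BLSI isolates the lowest set bit of X, and BLSR clears
       // the lowest set bit of X; both come from the BMI1 extension.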
15567  if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
15568    SDValue N0 = N->getOperand(0);
15569    SDValue N1 = N->getOperand(1);
15570    DebugLoc DL = N->getDebugLoc();
15571
15572    // Check LHS for not
15573    if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
15574      return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
15575    // Check RHS for not
15576    if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
15577      return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
15578
15579    // Check LHS for neg
15580    if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
15581        isZero(N0.getOperand(0)))
15582      return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
15583
15584    // Check RHS for neg
15585    if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
15586        isZero(N1.getOperand(0)))
15587      return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
15588
15589    // Check LHS for X-1
15590    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
15591        isAllOnes(N0.getOperand(1)))
15592      return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
15593
15594    // Check RHS for X-1
15595    if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
15596        isAllOnes(N1.getOperand(1)))
15597      return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
15598
15599    return SDValue();
15600  }
15601
15602  // Want to form ANDNP nodes:
15603  // 1) In the hopes of then easily combining them with OR and AND nodes
15604  //    to form PBLEND/PSIGN.
15605  // 2) To match ANDN packed intrinsics
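       // For illustration (schematic only): (and (xor X, all-ones), Y) is rewritten
       // below to (X86ISD::ANDNP X, Y).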
15606  if (VT != MVT::v2i64 && VT != MVT::v4i64)
15607    return SDValue();
15608
15609  SDValue N0 = N->getOperand(0);
15610  SDValue N1 = N->getOperand(1);
15611  DebugLoc DL = N->getDebugLoc();
15612
15613  // Check LHS for vnot
15614  if (N0.getOpcode() == ISD::XOR &&
15615      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
15616      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
15617    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
15618
15619  // Check RHS for vnot
15620  if (N1.getOpcode() == ISD::XOR &&
15621      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
15622      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
15623    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
15624
15625  return SDValue();
15626}
15627
15628static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
15629                                TargetLowering::DAGCombinerInfo &DCI,
15630                                const X86Subtarget *Subtarget) {
15631  if (DCI.isBeforeLegalizeOps())
15632    return SDValue();
15633
15634  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
15635  if (R.getNode())
15636    return R;
15637
15638  EVT VT = N->getValueType(0);
15639
15640  SDValue N0 = N->getOperand(0);
15641  SDValue N1 = N->getOperand(1);
15642
15643  // Look for psign/blend patterns.
15644  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
15645    if (!Subtarget->hasSSSE3() ||
15646        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
15647      return SDValue();
15648
15649    // Canonicalize pandn to RHS
15650    if (N0.getOpcode() == X86ISD::ANDNP)
15651      std::swap(N0, N1);
15652    // or (and (m, y), (pandn m, x))
15653    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
15654      SDValue Mask = N1.getOperand(0);
15655      SDValue X    = N1.getOperand(1);
15656      SDValue Y;
15657      if (N0.getOperand(0) == Mask)
15658        Y = N0.getOperand(1);
15659      if (N0.getOperand(1) == Mask)
15660        Y = N0.getOperand(0);
15661
15662      // Check to see if the mask appeared in both the AND and the ANDNP.
15663      if (!Y.getNode())
15664        return SDValue();
15665
15666      // Validate that X, Y, and Mask are bitcasts, and see through them.
15667      // Look through mask bitcast.
15668      if (Mask.getOpcode() == ISD::BITCAST)
15669        Mask = Mask.getOperand(0);
15670      if (X.getOpcode() == ISD::BITCAST)
15671        X = X.getOperand(0);
15672      if (Y.getOpcode() == ISD::BITCAST)
15673        Y = Y.getOperand(0);
15674
15675      EVT MaskVT = Mask.getValueType();
15676
15677      // Validate that the Mask operand is a vector sra node.
15678      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
15679      // there is no psrai.b
15680      if (Mask.getOpcode() != X86ISD::VSRAI)
15681        return SDValue();
15682
15683      // Check that the SRA is all signbits.
15684      SDValue SraC = Mask.getOperand(1);
15685      unsigned SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
15686      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
15687      if ((SraAmt + 1) != EltBits)
15688        return SDValue();
15689
15690      DebugLoc DL = N->getDebugLoc();
15691
15692      // We are going to replace the AND, OR, ANDNP with either BLEND
15693      // or PSIGN, which only look at the MSB. The VSRAI instruction
15694      // does not affect the highest bit, so we can get rid of it.
15695      Mask = Mask.getOperand(0);
15696
15697      // Now we know we at least have a pblendvb with the mask val.  See if
15698      // we can form a psignb/w/d.
15699      // psign = x.type == y.type == mask.type && y = sub(0, x);
15700      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
15701          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
15702          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
15703        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
15704               "Unsupported VT for PSIGN");
15705        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask);
15706        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
15707      }
15708      // PBLENDVB is only available with SSE 4.1.
15709      if (!Subtarget->hasSSE41())
15710        return SDValue();
15711
15712      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
15713
15714      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
15715      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
15716      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
15717      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
15718      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
15719    }
15720  }
15721
15722  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
15723    return SDValue();
15724
15725  // Fold (or (x << c), (y >> (64 - c))) -> (shld64 x, y, c).
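       // For example (illustrative), for i32: (or (shl x, 12), (srl y, 20)) becomes
       // (X86ISD::SHLD x, y, 12), since 12 + 20 == 32.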
15726  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
15727    std::swap(N0, N1);
15728  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
15729    return SDValue();
15730  if (!N0.hasOneUse() || !N1.hasOneUse())
15731    return SDValue();
15732
15733  SDValue ShAmt0 = N0.getOperand(1);
15734  if (ShAmt0.getValueType() != MVT::i8)
15735    return SDValue();
15736  SDValue ShAmt1 = N1.getOperand(1);
15737  if (ShAmt1.getValueType() != MVT::i8)
15738    return SDValue();
15739  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
15740    ShAmt0 = ShAmt0.getOperand(0);
15741  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
15742    ShAmt1 = ShAmt1.getOperand(0);
15743
15744  DebugLoc DL = N->getDebugLoc();
15745  unsigned Opc = X86ISD::SHLD;
15746  SDValue Op0 = N0.getOperand(0);
15747  SDValue Op1 = N1.getOperand(0);
15748  if (ShAmt0.getOpcode() == ISD::SUB) {
15749    Opc = X86ISD::SHRD;
15750    std::swap(Op0, Op1);
15751    std::swap(ShAmt0, ShAmt1);
15752  }
15753
15754  unsigned Bits = VT.getSizeInBits();
15755  if (ShAmt1.getOpcode() == ISD::SUB) {
15756    SDValue Sum = ShAmt1.getOperand(0);
15757    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
15758      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
15759      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
15760        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
15761      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
15762        return DAG.getNode(Opc, DL, VT,
15763                           Op0, Op1,
15764                           DAG.getNode(ISD::TRUNCATE, DL,
15765                                       MVT::i8, ShAmt0));
15766    }
15767  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
15768    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
15769    if (ShAmt0C &&
15770        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
15771      return DAG.getNode(Opc, DL, VT,
15772                         N0.getOperand(0), N1.getOperand(0),
15773                         DAG.getNode(ISD::TRUNCATE, DL,
15774                                       MVT::i8, ShAmt0));
15775  }
15776
15777  return SDValue();
15778}
15779
15780// Generate NEG and CMOV for integer abs.
15781static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
15782  EVT VT = N->getValueType(0);
15783
15784  // Since X86 does not have CMOV for 8-bit integer, we don't convert
15785  // 8-bit integer abs to NEG and CMOV.
15786  if (VT.isInteger() && VT.getSizeInBits() == 8)
15787    return SDValue();
15788
15789  SDValue N0 = N->getOperand(0);
15790  SDValue N1 = N->getOperand(1);
15791  DebugLoc DL = N->getDebugLoc();
15792
15793  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
15794  // and change it to SUB and CMOV.
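       // For illustration: for i32, abs is commonly written as
       //   t = sra X, 31;  abs = xor (add X, t), t
       // and is rewritten below as (sub 0, X) plus a CMOV that picks X or 0-X
       // based on the flags produced by the subtraction.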
15795  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
15796      N0.getOpcode() == ISD::ADD &&
15797      N0.getOperand(1) == N1 &&
15798      N1.getOpcode() == ISD::SRA &&
15799      N1.getOperand(0) == N0.getOperand(0))
15800    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
15801      if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
15802        // Generate SUB & CMOV.
15803        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
15804                                  DAG.getConstant(0, VT), N0.getOperand(0));
15805
15806        SDValue Ops[] = { N0.getOperand(0), Neg,
15807                          DAG.getConstant(X86::COND_GE, MVT::i8),
15808                          SDValue(Neg.getNode(), 1) };
15809        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
15810                           Ops, array_lengthof(Ops));
15811      }
15812  return SDValue();
15813}
15814
15815// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
15816static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
15817                                 TargetLowering::DAGCombinerInfo &DCI,
15818                                 const X86Subtarget *Subtarget) {
15819  if (DCI.isBeforeLegalizeOps())
15820    return SDValue();
15821
15822  if (Subtarget->hasCMov()) {
15823    SDValue RV = performIntegerAbsCombine(N, DAG);
15824    if (RV.getNode())
15825      return RV;
15826  }
15827
15828  // Try forming BMI instructions if they are available.
15829  if (!Subtarget->hasBMI())
15830    return SDValue();
15831
15832  EVT VT = N->getValueType(0);
15833
15834  if (VT != MVT::i32 && VT != MVT::i64)
15835    return SDValue();
15836
15837  assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
15838
15839  // Create BLSMSK instructions by finding X ^ (X-1)
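       // For illustration: BLSMSK (from BMI1) produces a mask of all bits up to and
       // including the lowest set bit of X, which is exactly X ^ (X-1).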
15840  SDValue N0 = N->getOperand(0);
15841  SDValue N1 = N->getOperand(1);
15842  DebugLoc DL = N->getDebugLoc();
15843
15844  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
15845      isAllOnes(N0.getOperand(1)))
15846    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
15847
15848  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
15849      isAllOnes(N1.getOperand(1)))
15850    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
15851
15852  return SDValue();
15853}
15854
15855/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
15856static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
15857                                  TargetLowering::DAGCombinerInfo &DCI,
15858                                  const X86Subtarget *Subtarget) {
15859  LoadSDNode *Ld = cast<LoadSDNode>(N);
15860  EVT RegVT = Ld->getValueType(0);
15861  EVT MemVT = Ld->getMemoryVT();
15862  DebugLoc dl = Ld->getDebugLoc();
15863  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15864
15865  ISD::LoadExtType Ext = Ld->getExtensionType();
15866
15867  // If this is a vector EXT Load then attempt to optimize it using a
15868  // shuffle. We need SSSE3 shuffles.
15869  // TODO: It is possible to support ZExt by zeroing the undef values
15870  // during the shuffle phase or after the shuffle.
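       // For example (illustrative): an extending load of v8i8 to v8i16 is handled
       // below as one scalar load covering the 64 memory bits, a bitcast to v16i8,
       // and a shuffle that spreads the eight loaded bytes across the wider lanes.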
15871  if (RegVT.isVector() && RegVT.isInteger() &&
15872      Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
15873    assert(MemVT != RegVT && "Cannot extend to the same type");
15874    assert(MemVT.isVector() && "Must load a vector from memory");
15875
15876    unsigned NumElems = RegVT.getVectorNumElements();
15877    unsigned RegSz = RegVT.getSizeInBits();
15878    unsigned MemSz = MemVT.getSizeInBits();
15879    assert(RegSz > MemSz && "Register size must be greater than the mem size");
15880
15881    // All sizes must be a power of two.
15882    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
15883      return SDValue();
15884
15885    // Attempt to load the original value using scalar loads.
15886    // Find the largest scalar type that divides the total loaded size.
15887    MVT SclrLoadTy = MVT::i8;
15888    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
15889         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
15890      MVT Tp = (MVT::SimpleValueType)tp;
15891      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
15892        SclrLoadTy = Tp;
15893      }
15894    }
15895
15896    // On 32-bit systems, we can't use 64-bit integers. Try bitcasting to f64.
15897    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
15898        (64 <= MemSz))
15899      SclrLoadTy = MVT::f64;
15900
15901    // Calculate the number of scalar loads that we need to perform
15902    // in order to load our vector from memory.
15903    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
15904
15905    // Represent our vector as a sequence of elements which are the
15906    // largest scalar that we can load.
15907    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
15908      RegSz/SclrLoadTy.getSizeInBits());
15909
15910    // Represent the data using the same element type that is stored in
15911    // memory. In practice, we "widen" MemVT.
15912    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
15913                                  RegSz/MemVT.getScalarType().getSizeInBits());
15914
15915    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
15916      "Invalid vector type");
15917
15918    // We can't shuffle using an illegal type.
15919    if (!TLI.isTypeLegal(WideVecVT))
15920      return SDValue();
15921
15922    SmallVector<SDValue, 8> Chains;
15923    SDValue Ptr = Ld->getBasePtr();
15924    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
15925                                        TLI.getPointerTy());
15926    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
15927
15928    for (unsigned i = 0; i < NumLoads; ++i) {
15929      // Perform a single load.
15930      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
15931                                       Ptr, Ld->getPointerInfo(),
15932                                       Ld->isVolatile(), Ld->isNonTemporal(),
15933                                       Ld->isInvariant(), Ld->getAlignment());
15934      Chains.push_back(ScalarLoad.getValue(1));
15935      // Create the first element with SCALAR_TO_VECTOR in order to avoid
15936      // another round of DAG combining.
15937      if (i == 0)
15938        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
15939      else
15940        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
15941                          ScalarLoad, DAG.getIntPtrConstant(i));
15942
15943      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
15944    }
15945
15946    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
15947                               Chains.size());
15948
15949    // Bitcast the loaded value to a vector of the original element type, in
15950    // the size of the target vector type.
15951    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
15952    unsigned SizeRatio = RegSz/MemSz;
15953
15954    // Redistribute the loaded elements into the different locations.
15955    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
15956    for (unsigned i = 0; i != NumElems; ++i)
15957      ShuffleVec[i*SizeRatio] = i;
15958
15959    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
15960                                         DAG.getUNDEF(WideVecVT),
15961                                         &ShuffleVec[0]);
15962
15963    // Bitcast to the requested type.
15964    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
15965    // Replace the original load with the new sequence
15966    // and return the new chain.
15967    return DCI.CombineTo(N, Shuff, TF, true);
15968  }
15969
15970  return SDValue();
15971}
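
// As a rough worked example of the combine above, consider an extending load
// from v4i8 in memory into a v4i32 register on an SSSE3 target:
//   MemSz = 32, RegSz = 128, so a single scalar i32 load is emitted,
//   LoadUnitVecVT = v4i32 and WideVecVT = v16i8.
// With SizeRatio = RegSz/MemSz = 4 the shuffle mask becomes
//   <0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1>
// which spreads the four loaded bytes into the low byte of each 32-bit lane;
// the final bitcast to v4i32 then yields the any-extended elements with undef
// upper bits.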
15972
15973/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
15974static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
15975                                   const X86Subtarget *Subtarget) {
15976  StoreSDNode *St = cast<StoreSDNode>(N);
15977  EVT VT = St->getValue().getValueType();
15978  EVT StVT = St->getMemoryVT();
15979  DebugLoc dl = St->getDebugLoc();
15980  SDValue StoredVal = St->getOperand(1);
15981  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15982
15983  // If we are saving a concatenation of two XMM registers, perform two stores.
15984  // On Sandy Bridge, 256-bit memory operations are executed by two
15985  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
15986  // memory operation.
15987  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
15988      StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
15989      StoredVal.getNumOperands() == 2) {
15990    SDValue Value0 = StoredVal.getOperand(0);
15991    SDValue Value1 = StoredVal.getOperand(1);
15992
15993    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
15994    SDValue Ptr0 = St->getBasePtr();
15995    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
15996
15997    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
15998                                St->getPointerInfo(), St->isVolatile(),
15999                                St->isNonTemporal(), St->getAlignment());
16000    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
16001                                St->getPointerInfo(), St->isVolatile(),
16002                                St->isNonTemporal(), St->getAlignment());
16003    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
16004  }
16005
16006  // Optimize trunc store (of multiple scalars) to shuffle and store.
16007  // First, pack all of the elements in one place. Next, store to memory
16008  // in fewer chunks.
16009  if (St->isTruncatingStore() && VT.isVector()) {
16010    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16011    unsigned NumElems = VT.getVectorNumElements();
16012    assert(StVT != VT && "Cannot truncate to the same type");
16013    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
16014    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
16015
16016    // From/To element sizes and the element count must all be powers of two.
16017    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
16018    // We are going to use the original vector element type for storing, so the
16019    // accumulated smaller-element size must be a multiple of the store size.
16020    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
16021
16022    unsigned SizeRatio  = FromSz / ToSz;
16023
16024    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
16025
16026    // Create a type on which we perform the shuffle
16027    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
16028            StVT.getScalarType(), NumElems*SizeRatio);
16029
16030    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16031
16032    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
16033    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16034    for (unsigned i = 0; i != NumElems; ++i)
16035      ShuffleVec[i] = i * SizeRatio;
16036
16037    // Can't shuffle using an illegal type.
16038    if (!TLI.isTypeLegal(WideVecVT))
16039      return SDValue();
16040
16041    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
16042                                         DAG.getUNDEF(WideVecVT),
16043                                         &ShuffleVec[0]);
16044    // At this point all of the data is stored at the bottom of the
16045    // register. We now need to save it to mem.
16046
16047    // Find the largest store unit
16048    MVT StoreType = MVT::i8;
16049    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
16050         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
16051      MVT Tp = (MVT::SimpleValueType)tp;
16052      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
16053        StoreType = Tp;
16054    }
16055
16056    // On 32-bit systems we can't store 64-bit integers; try bitcasting to f64.
16057    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
16058        (64 <= NumElems * ToSz))
16059      StoreType = MVT::f64;
16060
16061    // Bitcast the original vector into a vector of store-size units
16062    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
16063            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
16064    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16065    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
16066    SmallVector<SDValue, 8> Chains;
16067    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
16068                                        TLI.getPointerTy());
16069    SDValue Ptr = St->getBasePtr();
16070
16071    // Perform one or more big stores into memory.
16072    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
16073      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
16074                                   StoreType, ShuffWide,
16075                                   DAG.getIntPtrConstant(i));
16076      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
16077                                St->getPointerInfo(), St->isVolatile(),
16078                                St->isNonTemporal(), St->getAlignment());
16079      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16080      Chains.push_back(Ch);
16081    }
16082
16083    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
16084                               Chains.size());
16085  }
16086
16087
16088  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
16089  // the FP state in cases where an emms may be missing.
16090  // A preferable solution to the general problem is to figure out the right
16091  // places to insert EMMS.  This qualifies as a quick hack.
16092
16093  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
16094  if (VT.getSizeInBits() != 64)
16095    return SDValue();
16096
16097  const Function *F = DAG.getMachineFunction().getFunction();
16098  bool NoImplicitFloatOps = F->getFnAttributes().
16099    hasAttribute(Attributes::NoImplicitFloat);
16100  bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
16101                     && Subtarget->hasSSE2();
16102  if ((VT.isVector() ||
16103       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
16104      isa<LoadSDNode>(St->getValue()) &&
16105      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
16106      St->getChain().hasOneUse() && !St->isVolatile()) {
16107    SDNode* LdVal = St->getValue().getNode();
16108    LoadSDNode *Ld = 0;
16109    int TokenFactorIndex = -1;
16110    SmallVector<SDValue, 8> Ops;
16111    SDNode* ChainVal = St->getChain().getNode();
16112    // Must be a store of a load.  We currently handle two cases:  the load
16113    // is a direct child, or it's under an intervening TokenFactor.  It is
16114    // possible to dig deeper under nested TokenFactors.
16115    if (ChainVal == LdVal)
16116      Ld = cast<LoadSDNode>(St->getChain());
16117    else if (St->getValue().hasOneUse() &&
16118             ChainVal->getOpcode() == ISD::TokenFactor) {
16119      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
16120        if (ChainVal->getOperand(i).getNode() == LdVal) {
16121          TokenFactorIndex = i;
16122          Ld = cast<LoadSDNode>(St->getValue());
16123        } else
16124          Ops.push_back(ChainVal->getOperand(i));
16125      }
16126    }
16127
16128    if (!Ld || !ISD::isNormalLoad(Ld))
16129      return SDValue();
16130
16131    // If this is not the MMX case, i.e. we are just turning i64 load/store
16132    // into f64 load/store, avoid the transformation if there are multiple
16133    // uses of the loaded value.
16134    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
16135      return SDValue();
16136
16137    DebugLoc LdDL = Ld->getDebugLoc();
16138    DebugLoc StDL = N->getDebugLoc();
16139    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
16140    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
16141    // pair instead.
16142    if (Subtarget->is64Bit() || F64IsLegal) {
16143      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
16144      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
16145                                  Ld->getPointerInfo(), Ld->isVolatile(),
16146                                  Ld->isNonTemporal(), Ld->isInvariant(),
16147                                  Ld->getAlignment());
16148      SDValue NewChain = NewLd.getValue(1);
16149      if (TokenFactorIndex != -1) {
16150        Ops.push_back(NewChain);
16151        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
16152                               Ops.size());
16153      }
16154      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
16155                          St->getPointerInfo(),
16156                          St->isVolatile(), St->isNonTemporal(),
16157                          St->getAlignment());
16158    }
16159
16160    // Otherwise, lower to two pairs of 32-bit loads / stores.
16161    SDValue LoAddr = Ld->getBasePtr();
16162    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
16163                                 DAG.getConstant(4, MVT::i32));
16164
16165    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
16166                               Ld->getPointerInfo(),
16167                               Ld->isVolatile(), Ld->isNonTemporal(),
16168                               Ld->isInvariant(), Ld->getAlignment());
16169    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
16170                               Ld->getPointerInfo().getWithOffset(4),
16171                               Ld->isVolatile(), Ld->isNonTemporal(),
16172                               Ld->isInvariant(),
16173                               MinAlign(Ld->getAlignment(), 4));
16174
16175    SDValue NewChain = LoLd.getValue(1);
16176    if (TokenFactorIndex != -1) {
16177      Ops.push_back(LoLd);
16178      Ops.push_back(HiLd);
16179      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
16180                             Ops.size());
16181    }
16182
16183    LoAddr = St->getBasePtr();
16184    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
16185                         DAG.getConstant(4, MVT::i32));
16186
16187    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
16188                                St->getPointerInfo(),
16189                                St->isVolatile(), St->isNonTemporal(),
16190                                St->getAlignment());
16191    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
16192                                St->getPointerInfo().getWithOffset(4),
16193                                St->isVolatile(),
16194                                St->isNonTemporal(),
16195                                MinAlign(St->getAlignment(), 4));
16196    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
16197  }
16198  return SDValue();
16199}
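
// As a sketch of the truncating-store path above, consider storing a v4i32
// value as v4i16 (FromSz = 32, ToSz = 16, SizeRatio = 2):
//   WideVecVT = v8i16 and the shuffle mask <0, 2, 4, 6, -1, -1, -1, -1>
//   packs the four low halves into the bottom 64 bits of the register,
// after which a single 64-bit store (i64, or f64 on 32-bit targets) writes
// the truncated elements in one chunk.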
16200
16201/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
16202/// and return the operands for the horizontal operation in LHS and RHS.  A
16203/// horizontal operation performs the binary operation on successive elements
16204/// of its first operand, then on successive elements of its second operand,
16205/// returning the resulting values in a vector.  For example, if
16206///   A = < float a0, float a1, float a2, float a3 >
16207/// and
16208///   B = < float b0, float b1, float b2, float b3 >
16209/// then the result of doing a horizontal operation on A and B is
16210///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
16211/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
16212/// A horizontal-op B, for some already available A and B, and if so then LHS is
16213/// set to A, RHS to B, and the routine returns 'true'.
16214/// Note that the binary operation should have the property that if one of the
16215/// operands is UNDEF then the result is UNDEF.
16216static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
16217  // Look for the following pattern: if
16218  //   A = < float a0, float a1, float a2, float a3 >
16219  //   B = < float b0, float b1, float b2, float b3 >
16220  // and
16221  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
16222  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
16223  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
16224  // which is A horizontal-op B.
16225
16226  // At least one of the operands should be a vector shuffle.
16227  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
16228      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
16229    return false;
16230
16231  EVT VT = LHS.getValueType();
16232
16233  assert((VT.is128BitVector() || VT.is256BitVector()) &&
16234         "Unsupported vector type for horizontal add/sub");
16235
16236  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
16237  // operate independently on 128-bit lanes.
16238  unsigned NumElts = VT.getVectorNumElements();
16239  unsigned NumLanes = VT.getSizeInBits()/128;
16240  unsigned NumLaneElts = NumElts / NumLanes;
16241  assert((NumLaneElts % 2 == 0) &&
16242         "Vector type should have an even number of elements in each lane");
16243  unsigned HalfLaneElts = NumLaneElts/2;
16244
16245  // View LHS in the form
16246  //   LHS = VECTOR_SHUFFLE A, B, LMask
16247  // If LHS is not a shuffle then pretend it is the shuffle
16248  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
16249  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
16250  // type VT.
16251  SDValue A, B;
16252  SmallVector<int, 16> LMask(NumElts);
16253  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
16254    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
16255      A = LHS.getOperand(0);
16256    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
16257      B = LHS.getOperand(1);
16258    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
16259    std::copy(Mask.begin(), Mask.end(), LMask.begin());
16260  } else {
16261    if (LHS.getOpcode() != ISD::UNDEF)
16262      A = LHS;
16263    for (unsigned i = 0; i != NumElts; ++i)
16264      LMask[i] = i;
16265  }
16266
16267  // Likewise, view RHS in the form
16268  //   RHS = VECTOR_SHUFFLE C, D, RMask
16269  SDValue C, D;
16270  SmallVector<int, 16> RMask(NumElts);
16271  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
16272    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
16273      C = RHS.getOperand(0);
16274    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
16275      D = RHS.getOperand(1);
16276    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
16277    std::copy(Mask.begin(), Mask.end(), RMask.begin());
16278  } else {
16279    if (RHS.getOpcode() != ISD::UNDEF)
16280      C = RHS;
16281    for (unsigned i = 0; i != NumElts; ++i)
16282      RMask[i] = i;
16283  }
16284
16285  // Check that the shuffles are both shuffling the same vectors.
16286  if (!(A == C && B == D) && !(A == D && B == C))
16287    return false;
16288
16289  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
16290  if (!A.getNode() && !B.getNode())
16291    return false;
16292
16293  // If A and B occur in reverse order in RHS, then "swap" them (which means
16294  // rewriting the mask).
16295  if (A != C)
16296    CommuteVectorShuffleMask(RMask, NumElts);
16297
16298  // At this point LHS and RHS are equivalent to
16299  //   LHS = VECTOR_SHUFFLE A, B, LMask
16300  //   RHS = VECTOR_SHUFFLE A, B, RMask
16301  // Check that the masks correspond to performing a horizontal operation.
16302  for (unsigned i = 0; i != NumElts; ++i) {
16303    int LIdx = LMask[i], RIdx = RMask[i];
16304
16305    // Ignore any UNDEF components.
16306    if (LIdx < 0 || RIdx < 0 ||
16307        (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
16308        (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
16309      continue;
16310
16311    // Check that successive elements are being operated on.  If not, this is
16312    // not a horizontal operation.
16313    unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
16314    unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
16315    int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
16316    if (!(LIdx == Index && RIdx == Index + 1) &&
16317        !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
16318      return false;
16319  }
16320
16321  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
16322  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
16323  return true;
16324}
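
// For a 256-bit example of the check above, take VT = v8f32 (NumLanes = 2,
// NumLaneElts = 4). The accepted masks are
//   LMask = <0, 2, 8, 10, 4, 6, 12, 14>
//   RMask = <1, 3, 9, 11, 5, 7, 13, 15>
// which corresponds to
//   A hop B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3,
//               a4 op a5, a6 op a7, b4 op b5, b6 op b7 >,
// matching the per-128-bit-lane behaviour of the AVX horizontal operations.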
16325
16326/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
16327static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
16328                                  const X86Subtarget *Subtarget) {
16329  EVT VT = N->getValueType(0);
16330  SDValue LHS = N->getOperand(0);
16331  SDValue RHS = N->getOperand(1);
16332
16333  // Try to synthesize horizontal adds from adds of shuffles.
16334  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
16335       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
16336      isHorizontalBinOp(LHS, RHS, true))
16337    return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
16338  return SDValue();
16339}
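
// Roughly, the IR that triggers this combine for v4f32 looks like:
//   %l = shufflevector <4 x float> %a, <4 x float> %b,
//                      <4 x i32> <i32 0, i32 2, i32 4, i32 6>
//   %r = shufflevector <4 x float> %a, <4 x float> %b,
//                      <4 x i32> <i32 1, i32 3, i32 5, i32 7>
//   %s = fadd <4 x float> %l, %r
// which becomes (X86ISD::FHADD %a, %b) and selects to HADDPS.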
16340
16341/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
16342static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
16343                                  const X86Subtarget *Subtarget) {
16344  EVT VT = N->getValueType(0);
16345  SDValue LHS = N->getOperand(0);
16346  SDValue RHS = N->getOperand(1);
16347
16348  // Try to synthesize horizontal subs from subs of shuffles.
16349  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
16350       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
16351      isHorizontalBinOp(LHS, RHS, false))
16352    return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
16353  return SDValue();
16354}
16355
16356/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
16357/// X86ISD::FXOR nodes.
16358static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
16359  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
16360  // F[X]OR(0.0, x) -> x
16361  // F[X]OR(x, 0.0) -> x
16362  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
16363    if (C->getValueAPF().isPosZero())
16364      return N->getOperand(1);
16365  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
16366    if (C->getValueAPF().isPosZero())
16367      return N->getOperand(0);
16368  return SDValue();
16369}
16370
16371/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
16372/// X86ISD::FMAX nodes.
16373static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
16374  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
16375
16376  // Only perform optimizations if UnsafeMath is used.
16377  if (!DAG.getTarget().Options.UnsafeFPMath)
16378    return SDValue();
16379
16380  // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
16381  // into FMINC and FMAXC, which are commutative operations.
16382  unsigned NewOp = 0;
16383  switch (N->getOpcode()) {
16384    default: llvm_unreachable("unknown opcode");
16385    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
16386    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
16387  }
16388
16389  return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
16390                     N->getOperand(0), N->getOperand(1));
16391}
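
// The restriction to unsafe-math exists because MINPS/MAXPS are not
// commutative in the presence of NaNs or signed zeros: they return the second
// source operand in the unordered case, so e.g. min(NaN, 1.0) == 1.0 while
// min(1.0, NaN) == NaN. The commutative FMINC/FMAXC forms are only used when
// that difference does not matter.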
16392
16393
16394/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
16395static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
16396  // FAND(0.0, x) -> 0.0
16397  // FAND(x, 0.0) -> 0.0
16398  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
16399    if (C->getValueAPF().isPosZero())
16400      return N->getOperand(0);
16401  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
16402    if (C->getValueAPF().isPosZero())
16403      return N->getOperand(1);
16404  return SDValue();
16405}
16406
16407static SDValue PerformBTCombine(SDNode *N,
16408                                SelectionDAG &DAG,
16409                                TargetLowering::DAGCombinerInfo &DCI) {
16410  // BT ignores high bits in the bit index operand.
16411  SDValue Op1 = N->getOperand(1);
16412  if (Op1.hasOneUse()) {
16413    unsigned BitWidth = Op1.getValueSizeInBits();
16414    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
16415    APInt KnownZero, KnownOne;
16416    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
16417                                          !DCI.isBeforeLegalizeOps());
16418    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16419    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
16420        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
16421      DCI.CommitTargetLoweringOpt(TLO);
16422  }
16423  return SDValue();
16424}
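
// For example, with a 32-bit first operand only the low five bits of the bit
// index are demanded here, so a mask such as "and $31, %idx" feeding the BT
// is redundant and can be stripped by SimplifyDemandedBits.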
16425
16426static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
16427  SDValue Op = N->getOperand(0);
16428  if (Op.getOpcode() == ISD::BITCAST)
16429    Op = Op.getOperand(0);
16430  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
16431  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
16432      VT.getVectorElementType().getSizeInBits() ==
16433      OpVT.getVectorElementType().getSizeInBits()) {
16434    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
16435  }
16436  return SDValue();
16437}
16438
16439static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
16440                                  TargetLowering::DAGCombinerInfo &DCI,
16441                                  const X86Subtarget *Subtarget) {
16442  if (!DCI.isBeforeLegalizeOps())
16443    return SDValue();
16444
16445  if (!Subtarget->hasFp256())
16446    return SDValue();
16447
16448  EVT VT = N->getValueType(0);
16449  SDValue Op = N->getOperand(0);
16450  EVT OpVT = Op.getValueType();
16451  DebugLoc dl = N->getDebugLoc();
16452
16453  if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
16454      (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
16455
16456    if (Subtarget->hasInt256())
16457      return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
16458
16459    // Optimize vectors in AVX mode:
16460    // sign extend v8i16 to v8i32 and
16461    //             v4i32 to v4i64.
16462    //
16463    // Divide the input vector into two parts; for v4i32 the two shuffle masks
16464    // will be { 0, 1, -1, -1} and { 2, 3, -1, -1}.
16465    // Use the vpmovsx instructions to extend v4i32 -> v2i64 and v8i16 -> v4i32,
16466    // then concatenate the two halves back to the original VT.
16467
16468    unsigned NumElems = OpVT.getVectorNumElements();
16469    SDValue Undef = DAG.getUNDEF(OpVT);
16470
16471    SmallVector<int,8> ShufMask1(NumElems, -1);
16472    for (unsigned i = 0; i != NumElems/2; ++i)
16473      ShufMask1[i] = i;
16474
16475    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]);
16476
16477    SmallVector<int,8> ShufMask2(NumElems, -1);
16478    for (unsigned i = 0; i != NumElems/2; ++i)
16479      ShufMask2[i] = i + NumElems/2;
16480
16481    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]);
16482
16483    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
16484                                  VT.getVectorNumElements()/2);
16485
16486    OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
16487    OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
16488
16489    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16490  }
16491  return SDValue();
16492}
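
// Sketch of the pre-AVX2 path above for v8i16 -> v8i32: the two half shuffles
// use the masks <0, 1, 2, 3, -1, -1, -1, -1> and <4, 5, 6, 7, -1, -1, -1, -1>,
// each half is sign extended to v4i32 with a vpmovsxwd-style VSEXT_MOVL, and
// the two v4i32 halves are concatenated back into the v8i32 result.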
16493
16494static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
16495                                 const X86Subtarget* Subtarget) {
16496  DebugLoc dl = N->getDebugLoc();
16497  EVT VT = N->getValueType(0);
16498
16499  // Let legalize expand this if it isn't a legal type yet.
16500  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
16501    return SDValue();
16502
16503  EVT ScalarVT = VT.getScalarType();
16504  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
16505      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
16506    return SDValue();
16507
16508  SDValue A = N->getOperand(0);
16509  SDValue B = N->getOperand(1);
16510  SDValue C = N->getOperand(2);
16511
16512  bool NegA = (A.getOpcode() == ISD::FNEG);
16513  bool NegB = (B.getOpcode() == ISD::FNEG);
16514  bool NegC = (C.getOpcode() == ISD::FNEG);
16515
16516  // The multiplication is negated when exactly one of A and B is negated.
16517  bool NegMul = (NegA != NegB);
16518  if (NegA)
16519    A = A.getOperand(0);
16520  if (NegB)
16521    B = B.getOperand(0);
16522  if (NegC)
16523    C = C.getOperand(0);
16524
16525  unsigned Opcode;
16526  if (!NegMul)
16527    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
16528  else
16529    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
16530
16531  return DAG.getNode(Opcode, dl, VT, A, B, C);
16532}
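
// The opcode selection above follows the usual FMA sign conventions, e.g.
//   (fma %a, %b, %c)               -> FMADD  = a*b + c
//   (fma %a, %b, (fneg %c))        -> FMSUB  = a*b - c
//   (fma (fneg %a), %b, %c)        -> FNMADD = -(a*b) + c
//   (fma (fneg %a), %b, (fneg %c)) -> FNMSUB = -(a*b) - c
// so exactly one negated multiplicand flips the product and NegC flips the
// addend.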
16533
16534static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
16535                                  TargetLowering::DAGCombinerInfo &DCI,
16536                                  const X86Subtarget *Subtarget) {
16537  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
16538  //           (and (i32 x86isd::setcc_carry), 1)
16539  // This eliminates the zext. This transformation is necessary because
16540  // ISD::SETCC is always legalized to i8.
16541  DebugLoc dl = N->getDebugLoc();
16542  SDValue N0 = N->getOperand(0);
16543  EVT VT = N->getValueType(0);
16544  EVT OpVT = N0.getValueType();
16545
16546  if (N0.getOpcode() == ISD::AND &&
16547      N0.hasOneUse() &&
16548      N0.getOperand(0).hasOneUse()) {
16549    SDValue N00 = N0.getOperand(0);
16550    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
16551      return SDValue();
16552    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
16553    if (!C || C->getZExtValue() != 1)
16554      return SDValue();
16555    return DAG.getNode(ISD::AND, dl, VT,
16556                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
16557                                   N00.getOperand(0), N00.getOperand(1)),
16558                       DAG.getConstant(1, VT));
16559  }
16560
16561  // Optimize vectors in AVX mode:
16562  //
16563  //   v8i16 -> v8i32
16564  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
16565  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
16566  //   Concat upper and lower parts.
16567  //
16568  //   v4i32 -> v4i64
16569  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
16570  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
16571  //   Concat upper and lower parts.
16572  //
16573  if (!DCI.isBeforeLegalizeOps())
16574    return SDValue();
16575
16576  if (!Subtarget->hasFp256())
16577    return SDValue();
16578
16579  if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
16580      ((VT == MVT::v4i64) && (OpVT == MVT::v4i32)))  {
16581
16582    if (Subtarget->hasInt256())
16583      return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
16584
16585    SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
16586    SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec);
16587    SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec);
16588
16589    EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
16590                               VT.getVectorNumElements()/2);
16591
16592    OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
16593    OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
16594
16595    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16596  }
16597
16598  return SDValue();
16599}
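
// Worked example of the AVX path above for v8i16 -> v8i32: vpunpcklwd with a
// zero vector interleaves the low words as <x0, 0, x1, 0, x2, 0, x3, 0> and
// vpunpckhwd does the same for the high words; bitcasting each half to v4i32
// yields the zero-extended values, and the concat rebuilds the v8i32 result.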
16600
16601// Optimize x == -y --> x+y == 0
16602//          x != -y --> x+y != 0
16603static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
16604  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16605  SDValue LHS = N->getOperand(0);
16606  SDValue RHS = N->getOperand(1);
16607
16608  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
16609    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
16610      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
16611        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
16612                                   LHS.getValueType(), RHS, LHS.getOperand(1));
16613        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
16614                            addV, DAG.getConstant(0, addV.getValueType()), CC);
16615      }
16616  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
16617    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
16618      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
16619        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
16620                                   RHS.getValueType(), LHS, RHS.getOperand(1));
16621        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
16622                            addV, DAG.getConstant(0, addV.getValueType()), CC);
16623      }
16624  return SDValue();
16625}
16626
16627// Helper function of PerformSETCCCombine. It materializes "setb reg" as
16628// "sbb reg,reg", since the latter can be extended without a zext and produces
16629// an all-ones value, which is more useful than 0/1 in some cases.
16630static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
16631  return DAG.getNode(ISD::AND, DL, MVT::i8,
16632                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
16633                                 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
16634                     DAG.getConstant(1, MVT::i8));
16635}
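
// The node built here is selected to roughly
//   sbb %al, %al      # 0x00 or 0xff depending on the carry flag
//   and $1, %al       # recover the 0/1 value of "setb"
// where the intermediate all-ones form can be reused directly whenever a
// wider mask is wanted, without an extra zero extension.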
16636
16637// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
16638static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
16639                                   TargetLowering::DAGCombinerInfo &DCI,
16640                                   const X86Subtarget *Subtarget) {
16641  DebugLoc DL = N->getDebugLoc();
16642  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
16643  SDValue EFLAGS = N->getOperand(1);
16644
16645  if (CC == X86::COND_A) {
16646    // Try to convert COND_A into COND_B in an attempt to facilitate
16647    // materializing "setb reg".
16648    //
16649    // Do not flip "x > c", where "c" is a constant, because the Cmp instruction
16650    // cannot take an immediate as its first operand.
16651    //
16652    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
16653        EFLAGS.getValueType().isInteger() &&
16654        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
16655      SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
16656                                   EFLAGS.getNode()->getVTList(),
16657                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
16658      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
16659      return MaterializeSETB(DL, NewEFLAGS, DAG);
16660    }
16661  }
16662
16663  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
16664  // a zext and produces an all-ones bit which is more useful than 0/1 in some
16665  // cases.
16666  if (CC == X86::COND_B)
16667    return MaterializeSETB(DL, EFLAGS, DAG);
16668
16669  SDValue Flags;
16670
16671  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
16672  if (Flags.getNode()) {
16673    SDValue Cond = DAG.getConstant(CC, MVT::i8);
16674    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
16675  }
16676
16677  return SDValue();
16678}
16679
16680// Optimize branch condition evaluation.
16681//
16682static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
16683                                    TargetLowering::DAGCombinerInfo &DCI,
16684                                    const X86Subtarget *Subtarget) {
16685  DebugLoc DL = N->getDebugLoc();
16686  SDValue Chain = N->getOperand(0);
16687  SDValue Dest = N->getOperand(1);
16688  SDValue EFLAGS = N->getOperand(3);
16689  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
16690
16691  SDValue Flags;
16692
16693  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
16694  if (Flags.getNode()) {
16695    SDValue Cond = DAG.getConstant(CC, MVT::i8);
16696    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
16697                       Flags);
16698  }
16699
16700  return SDValue();
16701}
16702
16703static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
16704                                        const X86TargetLowering *XTLI) {
16705  SDValue Op0 = N->getOperand(0);
16706  EVT InVT = Op0->getValueType(0);
16707
16708  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
16709  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
16710    DebugLoc dl = N->getDebugLoc();
16711    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
16712    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
16713    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
16714  }
16715
16716  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
16717  // a 32-bit target where SSE doesn't support i64->FP operations.
16718  if (Op0.getOpcode() == ISD::LOAD) {
16719    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
16720    EVT VT = Ld->getValueType(0);
16721    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
16722        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
16723        !XTLI->getSubtarget()->is64Bit() &&
16724        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
16725      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
16726                                          Ld->getChain(), Op0, DAG);
16727      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
16728      return FILDChain;
16729    }
16730  }
16731  return SDValue();
16732}
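
// For instance, on a 32-bit target (where i64 is not a legal type) a
// non-volatile i64 load that only feeds a sitofp cannot go through SSE, so
// BuildFILD folds the load into an x87 fild of the 64-bit memory operand and
// the old load's chain users are redirected to the FILD chain.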
16733
16734// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
16735static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
16736                                 X86TargetLowering::DAGCombinerInfo &DCI) {
16737  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
16738  // the result is either zero or one (depending on the input carry bit).
16739  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
16740  if (X86::isZeroNode(N->getOperand(0)) &&
16741      X86::isZeroNode(N->getOperand(1)) &&
16742      // We don't have a good way to replace an EFLAGS use, so only do this when
16743      // dead right now.
16744      SDValue(N, 1).use_empty()) {
16745    DebugLoc DL = N->getDebugLoc();
16746    EVT VT = N->getValueType(0);
16747    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
16748    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
16749                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
16750                                           DAG.getConstant(X86::COND_B,MVT::i8),
16751                                           N->getOperand(2)),
16752                               DAG.getConstant(1, VT));
16753    return DCI.CombineTo(N, Res1, CarryOut);
16754  }
16755
16756  return SDValue();
16757}
16758
16759// fold (add Y, (sete  X, 0)) -> adc  0, Y
16760//      (add Y, (setne X, 0)) -> sbb -1, Y
16761//      (sub (sete  X, 0), Y) -> sbb  0, Y
16762//      (sub (setne X, 0), Y) -> adc -1, Y
16763static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
16764  DebugLoc DL = N->getDebugLoc();
16765
16766  // Look through ZExts.
16767  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
16768  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
16769    return SDValue();
16770
16771  SDValue SetCC = Ext.getOperand(0);
16772  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
16773    return SDValue();
16774
16775  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
16776  if (CC != X86::COND_E && CC != X86::COND_NE)
16777    return SDValue();
16778
16779  SDValue Cmp = SetCC.getOperand(1);
16780  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
16781      !X86::isZeroNode(Cmp.getOperand(1)) ||
16782      !Cmp.getOperand(0).getValueType().isInteger())
16783    return SDValue();
16784
16785  SDValue CmpOp0 = Cmp.getOperand(0);
16786  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
16787                               DAG.getConstant(1, CmpOp0.getValueType()));
16788
16789  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
16790  if (CC == X86::COND_NE)
16791    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
16792                       DL, OtherVal.getValueType(), OtherVal,
16793                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
16794  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
16795                     DL, OtherVal.getValueType(), OtherVal,
16796                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
16797}
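
// Worked example of the first fold: for (add Y, (sete X, 0)) the combine
// emits (X86ISD::ADC Y, 0, (X86ISD::CMP X, 1)). The comparison X - 1 sets the
// carry flag exactly when X == 0, so "adc $0, Y" yields Y + 1 when X is zero
// and Y otherwise, which is precisely Y + (sete X, 0).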
16798
16799/// PerformADDCombine - Do target-specific dag combines on integer adds.
16800static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
16801                                 const X86Subtarget *Subtarget) {
16802  EVT VT = N->getValueType(0);
16803  SDValue Op0 = N->getOperand(0);
16804  SDValue Op1 = N->getOperand(1);
16805
16806  // Try to synthesize horizontal adds from adds of shuffles.
16807  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
16808       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
16809      isHorizontalBinOp(Op0, Op1, true))
16810    return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
16811
16812  return OptimizeConditionalInDecrement(N, DAG);
16813}
16814
16815static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
16816                                 const X86Subtarget *Subtarget) {
16817  SDValue Op0 = N->getOperand(0);
16818  SDValue Op1 = N->getOperand(1);
16819
16820  // X86 can't encode an immediate LHS of a sub. See if we can push the
16821  // negation into a preceding instruction.
16822  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
16823    // If the RHS of the sub is a XOR with one use and a constant, invert the
16824    // immediate. Then add one to the LHS of the sub so we can turn
16825    // X-Y -> X+~Y+1, saving one register.
16826    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
16827        isa<ConstantSDNode>(Op1.getOperand(1))) {
16828      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
16829      EVT VT = Op0.getValueType();
16830      SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
16831                                   Op1.getOperand(0),
16832                                   DAG.getConstant(~XorC, VT));
16833      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
16834                         DAG.getConstant(C->getAPIntValue()+1, VT));
16835    }
16836  }
16837
16838  // Try to synthesize horizontal adds from adds of shuffles.
16839  EVT VT = N->getValueType(0);
16840  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
16841       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
16842      isHorizontalBinOp(Op0, Op1, true))
16843    return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
16844
16845  return OptimizeConditionalInDecrement(N, DAG);
16846}
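
// The xor rewrite above relies on the identity
//   C - (X ^ K) == (X ^ ~K) + (C + 1)
// which follows from -V == ~V + 1 and ~(X ^ K) == X ^ ~K. For example, with
// 8-bit values C = 5, K = 3 and X = 10: 5 - (10 ^ 3) = 0xFC, and
// (10 ^ 0xFC) + 6 = 0xF6 + 6 = 0xFC as well.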
16847
16848/// performVZEXTCombine - Do target-specific dag combines on X86ISD::VZEXT.
16849static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
16850                                        TargetLowering::DAGCombinerInfo &DCI,
16851                                        const X86Subtarget *Subtarget) {
16852  // (vzext (bitcast (vzext (x)) -> (vzext x)
16853  SDValue In = N->getOperand(0);
16854  while (In.getOpcode() == ISD::BITCAST)
16855    In = In.getOperand(0);
16856
16857  if (In.getOpcode() != X86ISD::VZEXT)
16858    return SDValue();
16859
16860  return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), In.getOperand(0));
16861}
16862
16863SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
16864                                             DAGCombinerInfo &DCI) const {
16865  SelectionDAG &DAG = DCI.DAG;
16866  switch (N->getOpcode()) {
16867  default: break;
16868  case ISD::EXTRACT_VECTOR_ELT:
16869    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
16870  case ISD::VSELECT:
16871  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
16872  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
16873  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
16874  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
16875  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
16876  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
16877  case ISD::SHL:
16878  case ISD::SRA:
16879  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
16880  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
16881  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
16882  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
16883  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
16884  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
16885  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
16886  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
16887  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
16888  case X86ISD::FXOR:
16889  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
16890  case X86ISD::FMIN:
16891  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
16892  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
16893  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
16894  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
16895  case ISD::ANY_EXTEND:
16896  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
16897  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
16898  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
16899  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
16900  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
16901  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
16902  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
16903  case X86ISD::SHUFP:       // Handle all target specific shuffles
16904  case X86ISD::PALIGN:
16905  case X86ISD::UNPCKH:
16906  case X86ISD::UNPCKL:
16907  case X86ISD::MOVHLPS:
16908  case X86ISD::MOVLHPS:
16909  case X86ISD::PSHUFD:
16910  case X86ISD::PSHUFHW:
16911  case X86ISD::PSHUFLW:
16912  case X86ISD::MOVSS:
16913  case X86ISD::MOVSD:
16914  case X86ISD::VPERMILP:
16915  case X86ISD::VPERM2X128:
16916  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
16917  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
16918  }
16919
16920  return SDValue();
16921}
16922
16923/// isTypeDesirableForOp - Return true if the target has native support for
16924/// the specified value type and it is 'desirable' to use the type for the
16925/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
16926/// instruction encodings are longer and some i16 instructions are slow.
16927bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
16928  if (!isTypeLegal(VT))
16929    return false;
16930  if (VT != MVT::i16)
16931    return true;
16932
16933  switch (Opc) {
16934  default:
16935    return true;
16936  case ISD::LOAD:
16937  case ISD::SIGN_EXTEND:
16938  case ISD::ZERO_EXTEND:
16939  case ISD::ANY_EXTEND:
16940  case ISD::SHL:
16941  case ISD::SRL:
16942  case ISD::SUB:
16943  case ISD::ADD:
16944  case ISD::MUL:
16945  case ISD::AND:
16946  case ISD::OR:
16947  case ISD::XOR:
16948    return false;
16949  }
16950}
16951
16952/// IsDesirableToPromoteOp - This method queries the target whether it is
16953/// beneficial for dag combiner to promote the specified node. If true, it
16954/// should return the desired promotion type by reference.
16955bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
16956  EVT VT = Op.getValueType();
16957  if (VT != MVT::i16)
16958    return false;
16959
16960  bool Promote = false;
16961  bool Commute = false;
16962  switch (Op.getOpcode()) {
16963  default: break;
16964  case ISD::LOAD: {
16965    LoadSDNode *LD = cast<LoadSDNode>(Op);
16966    // If the non-extending load has a single use and it's not live out, then it
16967    // might be folded.
16968    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
16969                                                     Op.hasOneUse()*/) {
16970      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16971             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
16972        // The only case where we'd want to promote LOAD (rather than it being
16973        // promoted as an operand) is when its only use is a liveout.
16974        if (UI->getOpcode() != ISD::CopyToReg)
16975          return false;
16976      }
16977    }
16978    Promote = true;
16979    break;
16980  }
16981  case ISD::SIGN_EXTEND:
16982  case ISD::ZERO_EXTEND:
16983  case ISD::ANY_EXTEND:
16984    Promote = true;
16985    break;
16986  case ISD::SHL:
16987  case ISD::SRL: {
16988    SDValue N0 = Op.getOperand(0);
16989    // Look out for (store (shl (load), x)).
16990    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
16991      return false;
16992    Promote = true;
16993    break;
16994  }
16995  case ISD::ADD:
16996  case ISD::MUL:
16997  case ISD::AND:
16998  case ISD::OR:
16999  case ISD::XOR:
17000    Commute = true;
17001    // fallthrough
17002  case ISD::SUB: {
17003    SDValue N0 = Op.getOperand(0);
17004    SDValue N1 = Op.getOperand(1);
17005    if (!Commute && MayFoldLoad(N1))
17006      return false;
17007    // Avoid disabling potential load folding opportunities.
17008    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
17009      return false;
17010    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
17011      return false;
17012    Promote = true;
17013  }
17014  }
17015
17016  PVT = MVT::i32;
17017  return Promote;
17018}
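
// For example, a plain i16 add whose operands are not foldable loads reports
// PVT = MVT::i32 here, so the DAG combiner performs the addition in i32 and
// truncates where needed, avoiding the longer 16-bit instruction encodings
// mentioned in isTypeDesirableForOp.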
17019
17020//===----------------------------------------------------------------------===//
17021//                           X86 Inline Assembly Support
17022//===----------------------------------------------------------------------===//
17023
17024namespace {
17025  // Helper to match a string against pieces separated by whitespace.
17026  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
17027    s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
17028
17029    for (unsigned i = 0, e = args.size(); i != e; ++i) {
17030      StringRef piece(*args[i]);
17031      if (!s.startswith(piece)) // Check if the piece matches.
17032        return false;
17033
17034      s = s.substr(piece.size());
17035      StringRef::size_type pos = s.find_first_not_of(" \t");
17036      if (pos == 0) // We matched a prefix.
17037        return false;
17038
17039      s = s.substr(pos);
17040    }
17041
17042    return s.empty();
17043  }
17044  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
17045}
17046
17047bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
17048  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
17049
17050  std::string AsmStr = IA->getAsmString();
17051
17052  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
17053  if (!Ty || Ty->getBitWidth() % 16 != 0)
17054    return false;
17055
17056  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
17057  SmallVector<StringRef, 4> AsmPieces;
17058  SplitString(AsmStr, AsmPieces, ";\n");
17059
17060  switch (AsmPieces.size()) {
17061  default: return false;
17062  case 1:
17063    // FIXME: this should verify that we are targeting a 486 or better.  If not,
17064    // we will turn this bswap into something that will be lowered to logical
17065    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
17066    // lower so don't worry about this.
17067    // bswap $0
17068    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
17069        matchAsm(AsmPieces[0], "bswapl", "$0") ||
17070        matchAsm(AsmPieces[0], "bswapq", "$0") ||
17071        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
17072        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
17073        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
17074      // No need to check constraints, nothing other than the equivalent of
17075      // "=r,0" would be valid here.
17076      return IntrinsicLowering::LowerToByteSwap(CI);
17077    }
17078
17079    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
17080    if (CI->getType()->isIntegerTy(16) &&
17081        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
17082        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
17083         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
17084      AsmPieces.clear();
17085      const std::string &ConstraintsStr = IA->getConstraintString();
17086      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
17087      std::sort(AsmPieces.begin(), AsmPieces.end());
17088      if (AsmPieces.size() == 4 &&
17089          AsmPieces[0] == "~{cc}" &&
17090          AsmPieces[1] == "~{dirflag}" &&
17091          AsmPieces[2] == "~{flags}" &&
17092          AsmPieces[3] == "~{fpsr}")
17093      return IntrinsicLowering::LowerToByteSwap(CI);
17094    }
17095    break;
17096  case 3:
17097    if (CI->getType()->isIntegerTy(32) &&
17098        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
17099        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
17100        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
17101        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
17102      AsmPieces.clear();
17103      const std::string &ConstraintsStr = IA->getConstraintString();
17104      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
17105      std::sort(AsmPieces.begin(), AsmPieces.end());
17106      if (AsmPieces.size() == 4 &&
17107          AsmPieces[0] == "~{cc}" &&
17108          AsmPieces[1] == "~{dirflag}" &&
17109          AsmPieces[2] == "~{flags}" &&
17110          AsmPieces[3] == "~{fpsr}")
17111        return IntrinsicLowering::LowerToByteSwap(CI);
17112    }
17113
17114    if (CI->getType()->isIntegerTy(64)) {
17115      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
17116      if (Constraints.size() >= 2 &&
17117          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
17118          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
17119        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
17120        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
17121            matchAsm(AsmPieces[1], "bswap", "%edx") &&
17122            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
17123          return IntrinsicLowering::LowerToByteSwap(CI);
17124      }
17125    }
17126    break;
17127  }
17128  return false;
17129}
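
// The single-instruction case handled above corresponds roughly to source
// such as
//   asm("bswap $0" : "=r"(x) : "0"(x));
// which is replaced by a call to the llvm.bswap intrinsic so the normal
// lowering and optimizations apply to it.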
17130
17131
17132
17133/// getConstraintType - Given a constraint letter, return the type of
17134/// constraint it is for this target.
17135X86TargetLowering::ConstraintType
17136X86TargetLowering::getConstraintType(const std::string &Constraint) const {
17137  if (Constraint.size() == 1) {
17138    switch (Constraint[0]) {
17139    case 'R':
17140    case 'q':
17141    case 'Q':
17142    case 'f':
17143    case 't':
17144    case 'u':
17145    case 'y':
17146    case 'x':
17147    case 'Y':
17148    case 'l':
17149      return C_RegisterClass;
17150    case 'a':
17151    case 'b':
17152    case 'c':
17153    case 'd':
17154    case 'S':
17155    case 'D':
17156    case 'A':
17157      return C_Register;
17158    case 'I':
17159    case 'J':
17160    case 'K':
17161    case 'L':
17162    case 'M':
17163    case 'N':
17164    case 'G':
17165    case 'C':
17166    case 'e':
17167    case 'Z':
17168      return C_Other;
17169    default:
17170      break;
17171    }
17172  }
17173  return TargetLowering::getConstraintType(Constraint);
17174}
17175
17176/// Examine constraint type and operand type and determine a weight value.
17177/// This object must already have been set up with the operand type
17178/// and the current alternative constraint selected.
17179TargetLowering::ConstraintWeight
17180  X86TargetLowering::getSingleConstraintMatchWeight(
17181    AsmOperandInfo &info, const char *constraint) const {
17182  ConstraintWeight weight = CW_Invalid;
17183  Value *CallOperandVal = info.CallOperandVal;
17184  // If we don't have a value, we can't do a match,
17185  // but allow it at the lowest weight.
17186  if (CallOperandVal == NULL)
17187    return CW_Default;
17188  Type *type = CallOperandVal->getType();
17189  // Look at the constraint type.
17190  switch (*constraint) {
17191  default:
17192    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
17193  case 'R':
17194  case 'q':
17195  case 'Q':
17196  case 'a':
17197  case 'b':
17198  case 'c':
17199  case 'd':
17200  case 'S':
17201  case 'D':
17202  case 'A':
17203    if (type->isIntegerTy())
17204      weight = CW_SpecificReg;
17205    break;
17206  case 'f':
17207  case 't':
17208  case 'u':
17209    if (type->isFloatingPointTy())
17210      weight = CW_SpecificReg;
17211    break;
17212  case 'y':
17213    if (type->isX86_MMXTy() && Subtarget->hasMMX())
17214      weight = CW_SpecificReg;
17215    break;
17216  case 'x':
17217  case 'Y':
17218    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
17219        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
17220      weight = CW_Register;
17221    break;
17222  case 'I':
17223    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17224      if (C->getZExtValue() <= 31)
17225        weight = CW_Constant;
17226    }
17227    break;
17228  case 'J':
17229    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17230      if (C->getZExtValue() <= 63)
17231        weight = CW_Constant;
17232    }
17233    break;
17234  case 'K':
17235    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17236      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
17237        weight = CW_Constant;
17238    }
17239    break;
17240  case 'L':
17241    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17242      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
17243        weight = CW_Constant;
17244    }
17245    break;
17246  case 'M':
17247    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17248      if (C->getZExtValue() <= 3)
17249        weight = CW_Constant;
17250    }
17251    break;
17252  case 'N':
17253    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17254      if (C->getZExtValue() <= 0xff)
17255        weight = CW_Constant;
17256    }
17257    break;
17258  case 'G':
17259  case 'C':
17260    if (dyn_cast<ConstantFP>(CallOperandVal)) {
17261      weight = CW_Constant;
17262    }
17263    break;
17264  case 'e':
17265    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17266      if ((C->getSExtValue() >= -0x80000000LL) &&
17267          (C->getSExtValue() <= 0x7fffffffLL))
17268        weight = CW_Constant;
17269    }
17270    break;
17271  case 'Z':
17272    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17273      if (C->getZExtValue() <= 0xffffffff)
17274        weight = CW_Constant;
17275    }
17276    break;
17277  }
17278  return weight;
17279}
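
// For example, an i32 operand scored against the 'a' alternative gets
// CW_SpecificReg, and a ConstantInt of 7 scored against 'I' (0..31) gets
// CW_Constant; a non-constant operand scored against 'I' keeps CW_Invalid,
// steering the constraint matcher toward a different alternative.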
17280
17281/// LowerXConstraint - try to replace an X constraint, which matches anything,
17282/// with another that has more specific requirements based on the type of the
17283/// corresponding operand.
17284const char *X86TargetLowering::
17285LowerXConstraint(EVT ConstraintVT) const {
17286  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
17287  // 'f' like normal targets.
17288  if (ConstraintVT.isFloatingPoint()) {
17289    if (Subtarget->hasSSE2())
17290      return "Y";
17291    if (Subtarget->hasSSE1())
17292      return "x";
17293  }
17294
17295  return TargetLowering::LowerXConstraint(ConstraintVT);
17296}
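
// For example, with SSE2 available an operand such as
//   double d; asm("" : "+X"(d));
// has its 'X' (anything) constraint tightened to 'Y' here, so the value is
// kept in an XMM register rather than on the x87 stack; with no SSE at all
// the generic handling applies.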
17297
17298/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
17299/// vector.  If it is invalid, don't add anything to Ops.
17300void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17301                                                     std::string &Constraint,
17302                                                     std::vector<SDValue> &Ops,
17303                                                     SelectionDAG &DAG) const {
17304  SDValue Result(0, 0);
17305
17306  // Only support length 1 constraints for now.
17307  if (Constraint.length() > 1) return;
17308
17309  char ConstraintLetter = Constraint[0];
17310  switch (ConstraintLetter) {
17311  default: break;
17312  case 'I':
17313    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17314      if (C->getZExtValue() <= 31) {
17315        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
17316        break;
17317      }
17318    }
17319    return;
17320  case 'J':
17321    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17322      if (C->getZExtValue() <= 63) {
17323        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
17324        break;
17325      }
17326    }
17327    return;
17328  case 'K':
17329    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17330      if (isInt<8>(C->getSExtValue())) {
17331        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
17332        break;
17333      }
17334    }
17335    return;
17336  case 'N':
17337    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17338      if (C->getZExtValue() <= 255) {
17339        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
17340        break;
17341      }
17342    }
17343    return;
17344  case 'e': {
17345    // 32-bit signed value
17346    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17347      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
17348                                           C->getSExtValue())) {
17349        // Widen to 64 bits here to get it sign extended.
17350        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
17351        break;
17352      }
17353    }
17354    // FIXME gcc accepts some relocatable values here too, but only in certain
17355    // memory models; it's complicated.
17356    return;
17357  }
17358  case 'Z': {
17359    // 32-bit unsigned value
17360    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17361      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
17362                                           C->getZExtValue())) {
17363        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
17364        break;
17365      }
17366    }
17367    // FIXME gcc accepts some relocatable values here too, but only in certain
17368    // memory models; it's complicated.
17369    return;
17370  }
17371  case 'i': {
17372    // Literal immediates are always ok.
17373    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
17374      // Widen to 64 bits here to get it sign extended.
17375      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
17376      break;
17377    }
17378
17379    // In any sort of PIC mode addresses need to be computed at runtime by
17380    // adding in a register or some sort of table lookup.  These can't
17381    // be used as immediates.
17382    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
17383      return;
17384
17385    // If we are in non-pic codegen mode, we allow the address of a global
17386    // (with an optional displacement) to be used with 'i'.
17387    GlobalAddressSDNode *GA = 0;
17388    int64_t Offset = 0;
17389
17390    // Match either (GA), (GA+C), (GA+C1+C2), etc.
17391    while (1) {
17392      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
17393        Offset += GA->getOffset();
17394        break;
17395      } else if (Op.getOpcode() == ISD::ADD) {
17396        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
17397          Offset += C->getZExtValue();
17398          Op = Op.getOperand(0);
17399          continue;
17400        }
17401      } else if (Op.getOpcode() == ISD::SUB) {
17402        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
17403          Offset += -C->getZExtValue();
17404          Op = Op.getOperand(0);
17405          continue;
17406        }
17407      }
17408
17409      // Otherwise, this isn't something we can handle, reject it.
17410      return;
17411    }
17412
17413    const GlobalValue *GV = GA->getGlobal();
17414    // If we require an extra load to get this address, as in PIC mode, we
17415    // can't accept it.
17416    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
17417                                                        getTargetMachine())))
17418      return;
17419
17420    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
17421                                        GA->getValueType(0), Offset);
17422    break;
17423  }
17424  }
17425
17426  if (Result.getNode()) {
17427    Ops.push_back(Result);
17428    return;
17429  }
17430  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17431}
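
// For example, an 'i' operand whose value is (add (GlobalAddress @G), 8) is
// accepted in non-PIC code: the loop above peels the add, accumulates the
// offset 8, and the operand becomes a TargetGlobalAddress of @G with offset
// 8.  Under GOT or stub PIC the same operand is rejected, since the address
// would require an extra load or register computation at runtime.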
17432
17433std::pair<unsigned, const TargetRegisterClass*>
17434X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
17435                                                EVT VT) const {
17436  // First, see if this is a constraint that directly corresponds to an LLVM
17437  // register class.
17438  if (Constraint.size() == 1) {
17439    // GCC Constraint Letters
17440    switch (Constraint[0]) {
17441    default: break;
17442      // TODO: Slight differences here in allocation order and leaving
17443      // RIP in the class. Do they matter any more here than they do
17444      // in the normal allocation?
17445    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
17446      if (Subtarget->is64Bit()) {
17447        if (VT == MVT::i32 || VT == MVT::f32)
17448          return std::make_pair(0U, &X86::GR32RegClass);
17449        if (VT == MVT::i16)
17450          return std::make_pair(0U, &X86::GR16RegClass);
17451        if (VT == MVT::i8 || VT == MVT::i1)
17452          return std::make_pair(0U, &X86::GR8RegClass);
17453        if (VT == MVT::i64 || VT == MVT::f64)
17454          return std::make_pair(0U, &X86::GR64RegClass);
17455        break;
17456      }
17457      // 32-bit fallthrough
17458    case 'Q':   // Q_REGS
17459      if (VT == MVT::i32 || VT == MVT::f32)
17460        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
17461      if (VT == MVT::i16)
17462        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
17463      if (VT == MVT::i8 || VT == MVT::i1)
17464        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
17465      if (VT == MVT::i64)
17466        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
17467      break;
17468    case 'r':   // GENERAL_REGS
17469    case 'l':   // INDEX_REGS
17470      if (VT == MVT::i8 || VT == MVT::i1)
17471        return std::make_pair(0U, &X86::GR8RegClass);
17472      if (VT == MVT::i16)
17473        return std::make_pair(0U, &X86::GR16RegClass);
17474      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
17475        return std::make_pair(0U, &X86::GR32RegClass);
17476      return std::make_pair(0U, &X86::GR64RegClass);
17477    case 'R':   // LEGACY_REGS
17478      if (VT == MVT::i8 || VT == MVT::i1)
17479        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
17480      if (VT == MVT::i16)
17481        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
17482      if (VT == MVT::i32 || !Subtarget->is64Bit())
17483        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
17484      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
17485    case 'f':  // FP Stack registers.
17486      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
17487      // value to the correct fpstack register class.
17488      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
17489        return std::make_pair(0U, &X86::RFP32RegClass);
17490      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
17491        return std::make_pair(0U, &X86::RFP64RegClass);
17492      return std::make_pair(0U, &X86::RFP80RegClass);
17493    case 'y':   // MMX_REGS if MMX allowed.
17494      if (!Subtarget->hasMMX()) break;
17495      return std::make_pair(0U, &X86::VR64RegClass);
17496    case 'Y':   // SSE_REGS if SSE2 allowed
17497      if (!Subtarget->hasSSE2()) break;
17498      // FALL THROUGH.
17499    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
17500      if (!Subtarget->hasSSE1()) break;
17501
17502      switch (VT.getSimpleVT().SimpleTy) {
17503      default: break;
17504      // Scalar SSE types.
17505      case MVT::f32:
17506      case MVT::i32:
17507        return std::make_pair(0U, &X86::FR32RegClass);
17508      case MVT::f64:
17509      case MVT::i64:
17510        return std::make_pair(0U, &X86::FR64RegClass);
17511      // Vector types.
17512      case MVT::v16i8:
17513      case MVT::v8i16:
17514      case MVT::v4i32:
17515      case MVT::v2i64:
17516      case MVT::v4f32:
17517      case MVT::v2f64:
17518        return std::make_pair(0U, &X86::VR128RegClass);
17519      // AVX types.
17520      case MVT::v32i8:
17521      case MVT::v16i16:
17522      case MVT::v8i32:
17523      case MVT::v4i64:
17524      case MVT::v8f32:
17525      case MVT::v4f64:
17526        return std::make_pair(0U, &X86::VR256RegClass);
17527      }
17528      break;
17529    }
17530  }
17531
17532  // Use the default implementation in TargetLowering to convert the register
17533  // constraint into a member of a register class.
17534  std::pair<unsigned, const TargetRegisterClass*> Res;
17535  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
17536
17537  // Not found as a standard register?
17538  if (Res.second == 0) {
17539    // Map st(0) -> st(7) -> ST0
17540    if (Constraint.size() == 7 && Constraint[0] == '{' &&
17541        tolower(Constraint[1]) == 's' &&
17542        tolower(Constraint[2]) == 't' &&
17543        Constraint[3] == '(' &&
17544        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
17545        Constraint[5] == ')' &&
17546        Constraint[6] == '}') {
17547
17548      Res.first = X86::ST0+Constraint[4]-'0';
17549      Res.second = &X86::RFP80RegClass;
17550      return Res;
17551    }
17552
17553    // GCC allows "st(0)" to be called just plain "st".
17554    if (StringRef("{st}").equals_lower(Constraint)) {
17555      Res.first = X86::ST0;
17556      Res.second = &X86::RFP80RegClass;
17557      return Res;
17558    }
17559
17560    // flags -> EFLAGS
17561    if (StringRef("{flags}").equals_lower(Constraint)) {
17562      Res.first = X86::EFLAGS;
17563      Res.second = &X86::CCRRegClass;
17564      return Res;
17565    }
17566
17567    // 'A' means EAX + EDX.
17568    if (Constraint == "A") {
17569      Res.first = X86::EAX;
17570      Res.second = &X86::GR32_ADRegClass;
17571      return Res;
17572    }
17573    return Res;
17574  }
17575
17576  // Otherwise, check to see if this is a register class of the wrong value
17577  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
17578  // turn into {ax},{dx}.
17579  if (Res.second->hasType(VT))
17580    return Res;   // Correct type already, nothing to do.
17581
17582  // All of the single-register GCC register classes map their values onto
17583  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
17584  // really want an 8-bit or 32-bit register, map to the appropriate register
17585  // class and return the appropriate register.
17586  if (Res.second == &X86::GR16RegClass) {
17587    if (VT == MVT::i8) {
17588      unsigned DestReg = 0;
17589      switch (Res.first) {
17590      default: break;
17591      case X86::AX: DestReg = X86::AL; break;
17592      case X86::DX: DestReg = X86::DL; break;
17593      case X86::CX: DestReg = X86::CL; break;
17594      case X86::BX: DestReg = X86::BL; break;
17595      }
17596      if (DestReg) {
17597        Res.first = DestReg;
17598        Res.second = &X86::GR8RegClass;
17599      }
17600    } else if (VT == MVT::i32) {
17601      unsigned DestReg = 0;
17602      switch (Res.first) {
17603      default: break;
17604      case X86::AX: DestReg = X86::EAX; break;
17605      case X86::DX: DestReg = X86::EDX; break;
17606      case X86::CX: DestReg = X86::ECX; break;
17607      case X86::BX: DestReg = X86::EBX; break;
17608      case X86::SI: DestReg = X86::ESI; break;
17609      case X86::DI: DestReg = X86::EDI; break;
17610      case X86::BP: DestReg = X86::EBP; break;
17611      case X86::SP: DestReg = X86::ESP; break;
17612      }
17613      if (DestReg) {
17614        Res.first = DestReg;
17615        Res.second = &X86::GR32RegClass;
17616      }
17617    } else if (VT == MVT::i64) {
17618      unsigned DestReg = 0;
17619      switch (Res.first) {
17620      default: break;
17621      case X86::AX: DestReg = X86::RAX; break;
17622      case X86::DX: DestReg = X86::RDX; break;
17623      case X86::CX: DestReg = X86::RCX; break;
17624      case X86::BX: DestReg = X86::RBX; break;
17625      case X86::SI: DestReg = X86::RSI; break;
17626      case X86::DI: DestReg = X86::RDI; break;
17627      case X86::BP: DestReg = X86::RBP; break;
17628      case X86::SP: DestReg = X86::RSP; break;
17629      }
17630      if (DestReg) {
17631        Res.first = DestReg;
17632        Res.second = &X86::GR64RegClass;
17633      }
17634    }
17635  } else if (Res.second == &X86::FR32RegClass ||
17636             Res.second == &X86::FR64RegClass ||
17637             Res.second == &X86::VR128RegClass) {
17638    // Handle references to XMM physical registers that got mapped into the
17639    // wrong class.  This can happen with constraints like {xmm0} where the
17640    // target independent register mapper will just pick the first match it can
17641    // find, ignoring the required type.
17642
17643    if (VT == MVT::f32 || VT == MVT::i32)
17644      Res.second = &X86::FR32RegClass;
17645    else if (VT == MVT::f64 || VT == MVT::i64)
17646      Res.second = &X86::FR64RegClass;
17647    else if (X86::VR128RegClass.hasType(VT))
17648      Res.second = &X86::VR128RegClass;
17649    else if (X86::VR256RegClass.hasType(VT))
17650      Res.second = &X86::VR256RegClass;
17651  }
17652
17653  return Res;
17654}
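
// A few concrete mappings produced above, assuming the relevant subtarget
// features are present: ("x", v4f32) selects VR128 (an XMM register);
// ("{st(3)}", f80) yields X86::ST0+3 in RFP80; and ("{ax}", i32) first maps
// to AX in GR16 via the generic lookup and is then widened to EAX in GR32 by
// the fix-up code above, rather than being split into {ax},{dx}.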
17655
17656//===----------------------------------------------------------------------===//
17657//
17658// X86 cost model.
17659//
17660//===----------------------------------------------------------------------===//
17661
17662struct X86CostTblEntry {
17663  int ISD;
17664  MVT Type;
17665  unsigned Cost;
17666};
17667
17668static int
17669FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD, MVT Ty) {
17670  for (unsigned int i = 0; i < len; ++i)
17671    if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty)
17672      return i;
17673
17674  // Could not find an entry.
17675  return -1;
17676}
17677
17678struct X86TypeConversionCostTblEntry {
17679  int ISD;
17680  MVT Dst;
17681  MVT Src;
17682  unsigned Cost;
17683};
17684
17685static int
17686FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len,
17687                   int ISD, MVT Dst, MVT Src) {
17688  for (unsigned int i = 0; i < len; ++i)
17689    if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst)
17690      return i;
17691
17692  // Could not find an entry.
17693  return -1;
17694}
17695
17696ScalarTargetTransformInfo::PopcntHwSupport
17697X86ScalarTargetTransformImpl::getPopcntHwSupport(unsigned TyWidth) const {
17698  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
17699  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
17700
17701  // TODO: Currently the __builtin_popcount() implementation using SSE3
17702  //   instructions is inefficient. Once the problem is fixed, we should
17703  //   call ST.hasSSE3() instead of ST.hasSSE41().
17704  return ST.hasSSE41() ? Fast : None;
17705}
17706
17707unsigned
17708X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
17709                                                     Type *Ty) const {
17710  // Legalize the type.
17711  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Ty);
17712
17713  int ISD = InstructionOpcodeToISD(Opcode);
17714  assert(ISD && "Invalid opcode");
17715
17716  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
17717
17718  static const X86CostTblEntry AVX1CostTable[] = {
17719    // We don't have to scalarize unsupported ops. We can issue two half-sized
17720    // operations and we only need to extract the upper YMM half.
17721    // Two ops + 1 extract + 1 insert = 4.
17722    { ISD::MUL,     MVT::v8i32,    4 },
17723    { ISD::SUB,     MVT::v8i32,    4 },
17724    { ISD::ADD,     MVT::v8i32,    4 },
17725    { ISD::MUL,     MVT::v4i64,    4 },
17726    { ISD::SUB,     MVT::v4i64,    4 },
17727    { ISD::ADD,     MVT::v4i64,    4 },
17728    };
17729
17730  // Look for AVX1 lowering tricks.
17731  if (ST.hasAVX()) {
17732    int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD,
17733                          LT.second);
17734    if (Idx != -1)
17735      return LT.first * AVX1CostTable[Idx].Cost;
17736  }
17737  // Fallback to the default implementation.
17738  return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
17739}
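
// Worked example: an 'add <8 x i32>' on an AVX (but not AVX2) subtarget
// legalizes to a single v8i32 (LT.first == 1) and hits the table entry
// { ISD::ADD, MVT::v8i32, 4 }, giving a cost of 4; an 'add <16 x i32>'
// splits into two v8i32 halves (LT.first == 2) for a cost of 2 * 4 = 8.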
17740
17741unsigned
17742X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
17743                                                 unsigned Index) const {
17744  assert(Val->isVectorTy() && "This must be a vector type");
17745
17746  if (Index != -1U) {
17747    // Legalize the type.
17748    std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Val);
17749
17750    // This type is legalized to a scalar type.
17751    if (!LT.second.isVector())
17752      return 0;
17753
17754    // The type may be split. Normalize the index to the new type.
17755    unsigned Width = LT.second.getVectorNumElements();
17756    Index = Index % Width;
17757
17758    // Floating point scalars are already located in index #0.
17759    if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
17760      return 0;
17761  }
17762
17763  return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
17764}
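
// For example, extracting element 5 from an <8 x float> that legalizes to
// two v4f32 halves on a pre-AVX subtarget normalizes the index to
// 5 % 4 == 1, which is not the free lane 0, so the generic cost applies;
// extracting element 0 (or the first lane of either half) of a
// floating-point vector is reported as free (cost 0).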
17765
17766unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode,
17767                                                          Type *ValTy,
17768                                                          Type *CondTy) const {
17769  // Legalize the type.
17770  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(ValTy);
17771
17772  MVT MTy = LT.second;
17773
17774  int ISD = InstructionOpcodeToISD(Opcode);
17775  assert(ISD && "Invalid opcode");
17776
17777  const X86Subtarget &ST =
17778      TLI->getTargetMachine().getSubtarget<X86Subtarget>();
17779
17780  static const X86CostTblEntry SSE42CostTbl[] = {
17781    { ISD::SETCC,   MVT::v2f64,   1 },
17782    { ISD::SETCC,   MVT::v4f32,   1 },
17783    { ISD::SETCC,   MVT::v2i64,   1 },
17784    { ISD::SETCC,   MVT::v4i32,   1 },
17785    { ISD::SETCC,   MVT::v8i16,   1 },
17786    { ISD::SETCC,   MVT::v16i8,   1 },
17787  };
17788
17789  static const X86CostTblEntry AVX1CostTbl[] = {
17790    { ISD::SETCC,   MVT::v4f64,   1 },
17791    { ISD::SETCC,   MVT::v8f32,   1 },
17792    // AVX1 does not support 256-bit integer compares.
17793    { ISD::SETCC,   MVT::v4i64,   4 },
17794    { ISD::SETCC,   MVT::v8i32,   4 },
17795    { ISD::SETCC,   MVT::v16i16,  4 },
17796    { ISD::SETCC,   MVT::v32i8,   4 },
17797  };
17798
17799  static const X86CostTblEntry AVX2CostTbl[] = {
17800    { ISD::SETCC,   MVT::v4i64,   1 },
17801    { ISD::SETCC,   MVT::v8i32,   1 },
17802    { ISD::SETCC,   MVT::v16i16,  1 },
17803    { ISD::SETCC,   MVT::v32i8,   1 },
17804  };
17805
17806  if (ST.hasSSE42()) {
17807    int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
17808    if (Idx != -1)
17809      return LT.first * SSE42CostTbl[Idx].Cost;
17810  }
17811
17812  // Check AVX2 before AVX1, since hasAVX() is also true on AVX2 subtargets.
17813  if (ST.hasAVX2()) {
17814    int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
17815    if (Idx != -1)
17816      return LT.first * AVX2CostTbl[Idx].Cost;
17817  }
17818  if (ST.hasAVX()) {
17819    int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
17820    if (Idx != -1)
17821      return LT.first * AVX1CostTbl[Idx].Cost;
17822  }
17823
17824  return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy);
17825}
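
// Worked example: a compare of <4 x float> costs 1 via the SSE4.2 table,
// while an integer compare of <8 x i32> costs 4 with plain AVX (two 128-bit
// compares plus extract/insert) and 1 once AVX2 is available, which is why
// the AVX2 table is consulted before the AVX1 one above.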
17826
17827unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode,
17828                                                        Type *Dst,
17829                                                        Type *Src) const {
17830  int ISD = InstructionOpcodeToISD(Opcode);
17831  assert(ISD && "Invalid opcode");
17832
17833  EVT SrcTy = TLI->getValueType(Src);
17834  EVT DstTy = TLI->getValueType(Dst);
17835
17836  if (!SrcTy.isSimple() || !DstTy.isSimple())
17837    return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
17838
17839  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
17840
17841  static const X86TypeConversionCostTblEntry AVXConversionTbl[] = {
17842    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
17843    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
17844    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
17845    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
17846    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 1 },
17847    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32, 1 },
17848    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  1 },
17849    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  1 },
17850    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  1 },
17851    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  1 },
17852    { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 1 },
17853    { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
17854    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1,  6 },
17855    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1,  9 },
17856    { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64, 3 },
17857  };
17858
17859  if (ST.hasAVX()) {
17860    int Idx = FindInConvertTable(AVXConversionTbl,
17861                                 array_lengthof(AVXConversionTbl),
17862                                 ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
17863    if (Idx != -1)
17864      return AVXConversionTbl[Idx].Cost;
17865  }
17866
17867  return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
17868}
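
// Worked example: on an AVX subtarget, 'sext <8 x i16> to <8 x i32>' and
// 'trunc <4 x i64> to <4 x i32>' each cost 1 per the table above, while
// 'zext <8 x i1> to <8 x i32>' costs 6; conversions not in the table (or
// involving non-simple types) fall back to the generic implementation.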
17869
17870