X86ISelLowering.cpp revision d6fb53adb19ccfbfb1eedec11c899aaa8401d036
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "x86-isel"
16#include "X86ISelLowering.h"
17#include "Utils/X86ShuffleDecode.h"
18#include "X86.h"
19#include "X86InstrBuilder.h"
20#include "X86TargetMachine.h"
21#include "X86TargetObjectFile.h"
22#include "llvm/ADT/SmallSet.h"
23#include "llvm/ADT/Statistic.h"
24#include "llvm/ADT/StringExtras.h"
25#include "llvm/ADT/VariadicFunction.h"
26#include "llvm/CallingConv.h"
27#include "llvm/CodeGen/IntrinsicLowering.h"
28#include "llvm/CodeGen/MachineFrameInfo.h"
29#include "llvm/CodeGen/MachineFunction.h"
30#include "llvm/CodeGen/MachineInstrBuilder.h"
31#include "llvm/CodeGen/MachineJumpTableInfo.h"
32#include "llvm/CodeGen/MachineModuleInfo.h"
33#include "llvm/CodeGen/MachineRegisterInfo.h"
34#include "llvm/Constants.h"
35#include "llvm/DerivedTypes.h"
36#include "llvm/Function.h"
37#include "llvm/GlobalAlias.h"
38#include "llvm/GlobalVariable.h"
39#include "llvm/Instructions.h"
40#include "llvm/Intrinsics.h"
41#include "llvm/LLVMContext.h"
42#include "llvm/MC/MCAsmInfo.h"
43#include "llvm/MC/MCContext.h"
44#include "llvm/MC/MCExpr.h"
45#include "llvm/MC/MCSymbol.h"
46#include "llvm/Support/CallSite.h"
47#include "llvm/Support/Debug.h"
48#include "llvm/Support/ErrorHandling.h"
49#include "llvm/Support/MathExtras.h"
50#include "llvm/Target/TargetOptions.h"
51#include <bitset>
52#include <cctype>
53using namespace llvm;
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57// Forward declarations.
58static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
59                       SDValue V2);
60
61/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
62/// sets things up to match to an AVX VEXTRACTF128 instruction or a
63/// simple subregister reference.  Idx is an index in the 128 bits we
64/// want.  It need not be aligned to a 128-bit boundary.  That makes
65/// lowering EXTRACT_VECTOR_ELT operations easier.
66static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
67                                   SelectionDAG &DAG, DebugLoc dl) {
68  EVT VT = Vec.getValueType();
69  assert(VT.is256BitVector() && "Unexpected vector size!");
70  EVT ElVT = VT.getVectorElementType();
71  unsigned Factor = VT.getSizeInBits()/128;
72  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
73                                  VT.getVectorNumElements()/Factor);
74
75  // Extract from UNDEF is UNDEF.
76  if (Vec.getOpcode() == ISD::UNDEF)
77    return DAG.getUNDEF(ResultVT);
78
79  // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
80  // that we can match to VEXTRACTF128.
81  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
82
83  // This is the index of the first element of the 128-bit chunk
84  // we want.
85  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
86                               * ElemsPerChunk);
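  // Worked example (illustration only): for a v8i32 source, ElVT is i32 and
  // ElemsPerChunk is 128/32 == 4.  With IdxVal == 5, (5*32)/128 == 1, so
  // NormalizedIdxVal == 4 -- the chunk starting at element 4, i.e. the upper
  // 128-bit half that contains element 5.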
87
88  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
89  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
90                               VecIdx);
91
92  return Result;
93}
94
95/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
96/// sets things up to match to an AVX VINSERTF128 instruction or a
97/// simple superregister reference.  Idx is an index in the 128 bits
98/// we want.  It need not be aligned to a 128-bit boundary.  That makes
99/// lowering INSERT_VECTOR_ELT operations easier.
100static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
101                                  unsigned IdxVal, SelectionDAG &DAG,
102                                  DebugLoc dl) {
103  // Inserting UNDEF just returns Result unchanged.
104  if (Vec.getOpcode() == ISD::UNDEF)
105    return Result;
106
107  EVT VT = Vec.getValueType();
108  assert(VT.is128BitVector() && "Unexpected vector size!");
109
110  EVT ElVT = VT.getVectorElementType();
111  EVT ResultVT = Result.getValueType();
112
113  // Insert the relevant 128 bits.
114  unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
115
116  // This is the index of the first element of the 128-bit chunk
117  // we want.
118  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
119                               * ElemsPerChunk);
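  // Illustration: inserting a v4i32 into a v8i32 with IdxVal == 6 gives
  // (6*32)/128 == 1 and NormalizedIdxVal == 4, so the 128-bit chunk is
  // inserted starting at element 4 (the upper half).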
120
121  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
122  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
123                     VecIdx);
124}
125
126/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
127/// instructions. This is used because creating CONCAT_VECTORS nodes of
128/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
129/// large BUILD_VECTORS.
130static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
131                                   unsigned NumElems, SelectionDAG &DAG,
132                                   DebugLoc dl) {
133  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
134  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
135}
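// Illustration: with VT == v8i32 and NumElems == 8, V1 ends up in elements
// 0-3 (the low 128 bits) and V2 in elements 4-7 (the high 128 bits).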
136
137static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
138  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
139  bool is64Bit = Subtarget->is64Bit();
140
141  if (Subtarget->isTargetEnvMacho()) {
142    if (is64Bit)
143      return new X86_64MachoTargetObjectFile();
144    return new TargetLoweringObjectFileMachO();
145  }
146
147  if (Subtarget->isTargetLinux())
148    return new X86LinuxTargetObjectFile();
149  if (Subtarget->isTargetELF())
150    return new TargetLoweringObjectFileELF();
151  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
152    return new TargetLoweringObjectFileCOFF();
153  llvm_unreachable("unknown subtarget type");
154}
155
156X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
157  : TargetLowering(TM, createTLOF(TM)) {
158  Subtarget = &TM.getSubtarget<X86Subtarget>();
159  X86ScalarSSEf64 = Subtarget->hasSSE2();
160  X86ScalarSSEf32 = Subtarget->hasSSE1();
161
162  RegInfo = TM.getRegisterInfo();
163  TD = getDataLayout();
164
165  // Set up the TargetLowering object.
166  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
167
168  // X86 is weird: it always uses i8 for shift amounts and setcc results.
169  setBooleanContents(ZeroOrOneBooleanContent);
170  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
171  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
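  // Concretely: a scalar SETCC materializes 0 or 1 in an i8 register (SETcc),
  // while a vector compare such as PCMPEQD produces all-zeros or all-ones
  // lanes that can be used directly as a mask.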
172
173  // For 64-bit, since we have so many registers, use the ILP scheduler; for
174  // 32-bit code, use the register-pressure-specific scheduling.
175  // For Atom, always use ILP scheduling.
176  if (Subtarget->isAtom())
177    setSchedulingPreference(Sched::ILP);
178  else if (Subtarget->is64Bit())
179    setSchedulingPreference(Sched::ILP);
180  else
181    setSchedulingPreference(Sched::RegPressure);
182  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
183
184  // Bypass slow i32 divides with i8 divides (e.g. on Atom) when compiling at -O2 or above.
185  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
186    addBypassSlowDiv(32, 8);
187
188  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
189    // Setup Windows compiler runtime calls.
190    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
191    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
192    setLibcallName(RTLIB::SREM_I64, "_allrem");
193    setLibcallName(RTLIB::UREM_I64, "_aullrem");
194    setLibcallName(RTLIB::MUL_I64, "_allmul");
195    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
196    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
197    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
198    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
199    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
200
201    // The _ftol2 runtime function has an unusual calling conv, which
202    // is modeled by a special pseudo-instruction.
203    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
204    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
205    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
206    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
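    // Clearing the names above keeps the legalizer from emitting the default
    // __fixuns* libcalls; these conversions are instead routed through the
    // custom FP_TO_UINT lowering guarded by isTargetFTOL() below.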
207  }
208
209  if (Subtarget->isTargetDarwin()) {
210    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
211    setUseUnderscoreSetJmp(false);
212    setUseUnderscoreLongJmp(false);
213  } else if (Subtarget->isTargetMingw()) {
214    // MS runtime is weird: it exports _setjmp, but plain longjmp!
215    setUseUnderscoreSetJmp(true);
216    setUseUnderscoreLongJmp(false);
217  } else {
218    setUseUnderscoreSetJmp(true);
219    setUseUnderscoreLongJmp(true);
220  }
221
222  // Set up the register classes.
223  addRegisterClass(MVT::i8, &X86::GR8RegClass);
224  addRegisterClass(MVT::i16, &X86::GR16RegClass);
225  addRegisterClass(MVT::i32, &X86::GR32RegClass);
226  if (Subtarget->is64Bit())
227    addRegisterClass(MVT::i64, &X86::GR64RegClass);
228
229  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
230
231  // We don't accept any truncstore of integer registers.
232  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
233  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
234  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
235  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
236  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
237  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
238
239  // SETOEQ and SETUNE require checking two conditions.
240  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
241  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
242  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
243  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
244  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
245  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
246
247  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
248  // operation.
249  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
250  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
251  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
252
253  if (Subtarget->is64Bit()) {
254    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
255    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
256  } else if (!TM.Options.UseSoftFloat) {
257    // We have an algorithm for SSE2->double, and we turn this into a
258    // 64-bit FILD followed by conditional FADD for other targets.
259    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
260    // We have an algorithm for SSE2, and we turn this into a 64-bit
261    // FILD for other targets.
262    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
263  }
264
265  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
266  // this operation.
267  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
268  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
269
270  if (!TM.Options.UseSoftFloat) {
271    // SSE has no i16 to fp conversion, only i32
272    if (X86ScalarSSEf32) {
273      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
274      // f32 and f64 cases are Legal, f80 case is not
275      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
276    } else {
277      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
278      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
279    }
280  } else {
281    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
282    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
283  }
284
285  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
286  // are Legal, f80 is custom lowered.
287  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
288  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
289
290  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
291  // this operation.
292  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
293  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
294
295  if (X86ScalarSSEf32) {
296    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
297    // f32 and f64 cases are Legal, f80 case is not
298    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
299  } else {
300    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
301    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
302  }
303
304  // Handle FP_TO_UINT by promoting the destination to a larger signed
305  // conversion.
306  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
307  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
308  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
309
310  if (Subtarget->is64Bit()) {
311    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
312    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
313  } else if (!TM.Options.UseSoftFloat) {
314    // Since AVX is a superset of SSE3, only check for SSE here.
315    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
316      // Expand FP_TO_UINT into a select.
317      // FIXME: We would like to use a Custom expander here eventually to do
318      // the optimal thing for SSE vs. the default expansion in the legalizer.
319      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
320    else
321      // With SSE3 we can use fisttpll to convert to a signed i64; without
322      // SSE, we're stuck with a fistpll.
323      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
324  }
325
326  if (isTargetFTOL()) {
327    // Use the _ftol2 runtime function, which has a pseudo-instruction
328    // to handle its weird calling convention.
329    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
330  }
331
332  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
333  if (!X86ScalarSSEf64) {
334    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
335    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
336    if (Subtarget->is64Bit()) {
337      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
338      // Without SSE, i64->f64 goes through memory.
339      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
340    }
341  }
342
343  // Scalar integer divide and remainder are lowered to use operations that
344  // produce two results, to match the available instructions. This exposes
345  // the two-result form to trivial CSE, which is able to combine x/y and x%y
346  // into a single instruction.
347  //
348  // Scalar integer multiply-high is also lowered to use two-result
349  // operations, to match the available instructions. However, plain multiply
350  // (low) operations are left as Legal, as there are single-result
351  // instructions for this in x86. Using the two-result multiply instructions
352  // when both high and low results are needed must be arranged by dagcombine.
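  // For example, with SDIV and SREM both marked Expand, an i32 x/y and x%y
  // pair is legalized to a single ISD::SDIVREM node (after CSE), which maps
  // onto IDIV's combined quotient-and-remainder result.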
353  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
354    MVT VT = IntVTs[i];
355    setOperationAction(ISD::MULHS, VT, Expand);
356    setOperationAction(ISD::MULHU, VT, Expand);
357    setOperationAction(ISD::SDIV, VT, Expand);
358    setOperationAction(ISD::UDIV, VT, Expand);
359    setOperationAction(ISD::SREM, VT, Expand);
360    setOperationAction(ISD::UREM, VT, Expand);
361
362    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
363    setOperationAction(ISD::ADDC, VT, Custom);
364    setOperationAction(ISD::ADDE, VT, Custom);
365    setOperationAction(ISD::SUBC, VT, Custom);
366    setOperationAction(ISD::SUBE, VT, Custom);
367  }
368
369  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
370  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
371  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
372  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
373  if (Subtarget->is64Bit())
374    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
375  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
376  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
377  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
378  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
379  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
380  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
381  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
382  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
383
384  // Promote the i8 variants and force them on up to i32 which has a shorter
385  // encoding.
386  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
387  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
388  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
389  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
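  // (There is no 8-bit BSF/TZCNT, and the 32-bit form avoids the operand-size
  // prefix that a 16-bit promotion would require.)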
390  if (Subtarget->hasBMI()) {
391    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
392    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
393    if (Subtarget->is64Bit())
394      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
395  } else {
396    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
397    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
398    if (Subtarget->is64Bit())
399      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
400  }
401
402  if (Subtarget->hasLZCNT()) {
403    // When promoting the i8 variants, force them to i32 for a shorter
404    // encoding.
405    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
406    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
407    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
408    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
409    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
410    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
411    if (Subtarget->is64Bit())
412      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
413  } else {
414    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
415    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
416    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
417    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
418    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
419    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
420    if (Subtarget->is64Bit()) {
421      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
422      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
423    }
424  }
425
426  if (Subtarget->hasPOPCNT()) {
427    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
428  } else {
429    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
430    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
431    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
432    if (Subtarget->is64Bit())
433      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
434  }
435
436  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
437  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
438
439  // These should be promoted to a larger select which is supported.
440  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
441  // X86 wants to expand cmov itself.
442  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
443  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
444  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
445  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
446  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
447  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
448  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
449  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
450  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
451  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
452  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
453  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
454  if (Subtarget->is64Bit()) {
455    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
456    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
457  }
458  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
459  // NOTE: EH_SJLJ_SETJMP/_LONGJMP as supported here is NOT intended to support
460  // SjLj exception handling, but rather a light-weight setjmp/longjmp
461  // replacement used for continuations, user-level threading, etc.  As a
462  // result, no other SjLj exception interfaces are implemented; please don't
463  // build your own exception handling on top of them.
464  // LLVM/Clang supports zero-cost DWARF exception handling.
465  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
466  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
467
468  // Darwin ABI issue.
469  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
470  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
471  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
472  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
473  if (Subtarget->is64Bit())
474    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
475  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
476  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
477  if (Subtarget->is64Bit()) {
478    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
479    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
480    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
481    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
482    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
483  }
484  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
485  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
486  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
487  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
488  if (Subtarget->is64Bit()) {
489    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
490    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
491    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
492  }
493
494  if (Subtarget->hasSSE1())
495    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
496
497  setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
498  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
499
500  // On X86 and X86-64, atomic operations are lowered to locked instructions.
501  // Locked instructions, in turn, have implicit fence semantics (all memory
502  // operations are flushed before issuing the locked instruction, and they
503  // are not buffered), so we can fold away the common pattern of
504  // fence-atomic-fence.
505  setShouldFoldAtomicFences(true);
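  // For example, a "fence; atomicrmw add; fence" sequence can drop both
  // fences, since the resulting LOCK'ed instruction already orders the
  // surrounding memory operations.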
506
507  // Expand certain atomics
508  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
509    MVT VT = IntVTs[i];
510    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
511    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
512    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
513  }
514
515  if (!Subtarget->is64Bit()) {
516    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
517    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
518    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
519    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
520    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
521    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
522    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
523    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
524    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
525    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
526    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
527    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
528  }
529
530  if (Subtarget->hasCmpxchg16b()) {
531    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
532  }
533
534  // FIXME - use subtarget debug flags
535  if (!Subtarget->isTargetDarwin() &&
536      !Subtarget->isTargetELF() &&
537      !Subtarget->isTargetCygMing()) {
538    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
539  }
540
541  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
542  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
543  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
544  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
545  if (Subtarget->is64Bit()) {
546    setExceptionPointerRegister(X86::RAX);
547    setExceptionSelectorRegister(X86::RDX);
548  } else {
549    setExceptionPointerRegister(X86::EAX);
550    setExceptionSelectorRegister(X86::EDX);
551  }
552  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
553  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
554
555  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
556  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
557
558  setOperationAction(ISD::TRAP, MVT::Other, Legal);
559  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
560
561  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
562  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
563  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
564  if (Subtarget->is64Bit()) {
565    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
566    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
567  } else {
568    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
569    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
570  }
571
572  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
573  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
574
575  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
576    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
577                       MVT::i64 : MVT::i32, Custom);
578  else if (TM.Options.EnableSegmentedStacks)
579    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
580                       MVT::i64 : MVT::i32, Custom);
581  else
582    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
583                       MVT::i64 : MVT::i32, Expand);
584
585  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
586    // f32 and f64 use SSE.
587    // Set up the FP register classes.
588    addRegisterClass(MVT::f32, &X86::FR32RegClass);
589    addRegisterClass(MVT::f64, &X86::FR64RegClass);
590
591    // Use ANDPD to simulate FABS.
592    setOperationAction(ISD::FABS , MVT::f64, Custom);
593    setOperationAction(ISD::FABS , MVT::f32, Custom);
594
595    // Use XORP to simulate FNEG.
596    setOperationAction(ISD::FNEG , MVT::f64, Custom);
597    setOperationAction(ISD::FNEG , MVT::f32, Custom);
598
599    // Use ANDPD and ORPD to simulate FCOPYSIGN.
600    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
601    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
602
603    // Lower this to FGETSIGNx86 plus an AND.
604    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
605    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
606
607    // We don't support sin/cos/fmod
608    setOperationAction(ISD::FSIN , MVT::f64, Expand);
609    setOperationAction(ISD::FCOS , MVT::f64, Expand);
610    setOperationAction(ISD::FSIN , MVT::f32, Expand);
611    setOperationAction(ISD::FCOS , MVT::f32, Expand);
612
613    // Expand FP immediates into loads from the stack, except for the special
614    // cases we handle.
615    addLegalFPImmediate(APFloat(+0.0)); // xorpd
616    addLegalFPImmediate(APFloat(+0.0f)); // xorps
617  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
618    // Use SSE for f32, x87 for f64.
619    // Set up the FP register classes.
620    addRegisterClass(MVT::f32, &X86::FR32RegClass);
621    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
622
623    // Use ANDPS to simulate FABS.
624    setOperationAction(ISD::FABS , MVT::f32, Custom);
625
626    // Use XORP to simulate FNEG.
627    setOperationAction(ISD::FNEG , MVT::f32, Custom);
628
629    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
630
631    // Use ANDPS and ORPS to simulate FCOPYSIGN.
632    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
633    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
634
635    // We don't support sin/cos/fmod
636    setOperationAction(ISD::FSIN , MVT::f32, Expand);
637    setOperationAction(ISD::FCOS , MVT::f32, Expand);
638
639    // Special cases we handle for FP constants.
640    addLegalFPImmediate(APFloat(+0.0f)); // xorps
641    addLegalFPImmediate(APFloat(+0.0)); // FLD0
642    addLegalFPImmediate(APFloat(+1.0)); // FLD1
643    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
644    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
645
646    if (!TM.Options.UnsafeFPMath) {
647      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
648      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
649    }
650  } else if (!TM.Options.UseSoftFloat) {
651    // f32 and f64 in x87.
652    // Set up the FP register classes.
653    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
654    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
655
656    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
657    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
658    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
659    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
660
661    if (!TM.Options.UnsafeFPMath) {
662      setOperationAction(ISD::FSIN           , MVT::f32  , Expand);
663      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
664      setOperationAction(ISD::FCOS           , MVT::f32  , Expand);
665      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
666    }
667    addLegalFPImmediate(APFloat(+0.0)); // FLD0
668    addLegalFPImmediate(APFloat(+1.0)); // FLD1
669    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
670    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
671    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
672    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
673    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
674    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
675  }
676
677  // We don't support FMA.
678  setOperationAction(ISD::FMA, MVT::f64, Expand);
679  setOperationAction(ISD::FMA, MVT::f32, Expand);
680
681  // Long double always uses X87.
682  if (!TM.Options.UseSoftFloat) {
683    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
684    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
685    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
686    {
687      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
688      addLegalFPImmediate(TmpFlt);  // FLD0
689      TmpFlt.changeSign();
690      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
691
692      bool ignored;
693      APFloat TmpFlt2(+1.0);
694      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
695                      &ignored);
696      addLegalFPImmediate(TmpFlt2);  // FLD1
697      TmpFlt2.changeSign();
698      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
699    }
700
701    if (!TM.Options.UnsafeFPMath) {
702      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
703      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
704    }
705
706    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
707    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
708    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
709    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
710    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
711    setOperationAction(ISD::FMA, MVT::f80, Expand);
712  }
713
714  // Always use a library call for pow.
715  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
716  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
717  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
718
719  setOperationAction(ISD::FLOG, MVT::f80, Expand);
720  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
721  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
722  setOperationAction(ISD::FEXP, MVT::f80, Expand);
723  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
724
725  // First set operation action for all vector types to either promote
726  // (for widening) or expand (for scalarization). Then we will selectively
727  // turn on ones that can be effectively codegen'd.
728  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
729           i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
730    MVT VT = (MVT::SimpleValueType)i;
731    setOperationAction(ISD::ADD , VT, Expand);
732    setOperationAction(ISD::SUB , VT, Expand);
733    setOperationAction(ISD::FADD, VT, Expand);
734    setOperationAction(ISD::FNEG, VT, Expand);
735    setOperationAction(ISD::FSUB, VT, Expand);
736    setOperationAction(ISD::MUL , VT, Expand);
737    setOperationAction(ISD::FMUL, VT, Expand);
738    setOperationAction(ISD::SDIV, VT, Expand);
739    setOperationAction(ISD::UDIV, VT, Expand);
740    setOperationAction(ISD::FDIV, VT, Expand);
741    setOperationAction(ISD::SREM, VT, Expand);
742    setOperationAction(ISD::UREM, VT, Expand);
743    setOperationAction(ISD::LOAD, VT, Expand);
744    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
745    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
746    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
747    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
748    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
749    setOperationAction(ISD::FABS, VT, Expand);
750    setOperationAction(ISD::FSIN, VT, Expand);
751    setOperationAction(ISD::FCOS, VT, Expand);
752    setOperationAction(ISD::FREM, VT, Expand);
753    setOperationAction(ISD::FMA,  VT, Expand);
754    setOperationAction(ISD::FPOWI, VT, Expand);
755    setOperationAction(ISD::FSQRT, VT, Expand);
756    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
757    setOperationAction(ISD::FFLOOR, VT, Expand);
758    setOperationAction(ISD::FCEIL, VT, Expand);
759    setOperationAction(ISD::FTRUNC, VT, Expand);
760    setOperationAction(ISD::FRINT, VT, Expand);
761    setOperationAction(ISD::FNEARBYINT, VT, Expand);
762    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
763    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
764    setOperationAction(ISD::SDIVREM, VT, Expand);
765    setOperationAction(ISD::UDIVREM, VT, Expand);
766    setOperationAction(ISD::FPOW, VT, Expand);
767    setOperationAction(ISD::CTPOP, VT, Expand);
768    setOperationAction(ISD::CTTZ, VT, Expand);
769    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
770    setOperationAction(ISD::CTLZ, VT, Expand);
771    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
772    setOperationAction(ISD::SHL, VT, Expand);
773    setOperationAction(ISD::SRA, VT, Expand);
774    setOperationAction(ISD::SRL, VT, Expand);
775    setOperationAction(ISD::ROTL, VT, Expand);
776    setOperationAction(ISD::ROTR, VT, Expand);
777    setOperationAction(ISD::BSWAP, VT, Expand);
778    setOperationAction(ISD::SETCC, VT, Expand);
779    setOperationAction(ISD::FLOG, VT, Expand);
780    setOperationAction(ISD::FLOG2, VT, Expand);
781    setOperationAction(ISD::FLOG10, VT, Expand);
782    setOperationAction(ISD::FEXP, VT, Expand);
783    setOperationAction(ISD::FEXP2, VT, Expand);
784    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
785    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
786    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
787    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
788    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
789    setOperationAction(ISD::TRUNCATE, VT, Expand);
790    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
791    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
792    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
793    setOperationAction(ISD::VSELECT, VT, Expand);
794    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
795             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
796      setTruncStoreAction(VT,
797                          (MVT::SimpleValueType)InnerVT, Expand);
798    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
799    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
800    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
801  }
802
803  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
804  // with -msoft-float, disable use of MMX as well.
805  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
806    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
807    // No operations on x86mmx supported, everything uses intrinsics.
808  }
809
810  // MMX-sized vectors (other than x86mmx) are expected to be expanded
811  // into smaller operations.
812  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
813  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
814  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
815  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
816  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
817  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
818  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
819  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
820  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
821  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
822  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
823  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
824  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
825  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
826  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
827  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
828  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
829  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
830  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
831  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
832  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
833  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
834  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
835  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
836  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
837  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
838  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
839  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
840  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
841
842  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
843    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
844
845    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
846    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
847    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
848    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
849    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
850    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
851    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
852    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
853    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
854    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
855    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
856    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
857  }
858
859  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
860    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
861
862    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean that XMM
863    // registers cannot be used even for integer operations.
864    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
865    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
866    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
867    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
868
869    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
870    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
871    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
872    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
873    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
874    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
875    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
876    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
877    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
878    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
879    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
880    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
881    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
882    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
883    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
884    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
885    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
886    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
887
888    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
889    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
890    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
891    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
892
893    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
894    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
895    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
896    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
897    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
898
899    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
900    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
901      MVT VT = (MVT::SimpleValueType)i;
902      // Do not attempt to custom lower non-power-of-2 vectors
903      if (!isPowerOf2_32(VT.getVectorNumElements()))
904        continue;
905      // Do not attempt to custom lower non-128-bit vectors
906      if (!VT.is128BitVector())
907        continue;
908      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
909      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
910      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
911    }
912
913    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
914    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
915    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
916    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
917    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
918    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
919
920    if (Subtarget->is64Bit()) {
921      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
922      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
923    }
924
925    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
926    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
927      MVT VT = (MVT::SimpleValueType)i;
928
929      // Do not attempt to promote non-128-bit vectors
930      if (!VT.is128BitVector())
931        continue;
932
933      setOperationAction(ISD::AND,    VT, Promote);
934      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
935      setOperationAction(ISD::OR,     VT, Promote);
936      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
937      setOperationAction(ISD::XOR,    VT, Promote);
938      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
939      setOperationAction(ISD::LOAD,   VT, Promote);
940      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
941      setOperationAction(ISD::SELECT, VT, Promote);
942      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
943    }
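    // With this promotion, an AND of two v4i32 values, for instance, is
    // bitcast to v2i64, performed there as a single PAND, and bitcast back;
    // the same pattern applies to OR, XOR, LOAD, and SELECT.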
944
945    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
946
947    // Custom lower v2i64 and v2f64 selects.
948    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
949    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
950    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
951    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
952
953    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
954    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
955
956    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
957    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
958    // As there is no 64-bit GPR available, we need to build a special custom
959    // sequence to convert from v2i32 to v2f32.
960    if (!Subtarget->is64Bit())
961      setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
962
963    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
964    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
965
966    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
967  }
968
969  if (Subtarget->hasSSE41()) {
970    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
971    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
972    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
973    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
974    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
975    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
976    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
977    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
978    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
979    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
980
981    setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
982    setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
983    setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
984    setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
985    setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
986    setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
987    setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
988    setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
989    setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
990    setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
991
992    // FIXME: Do we need to handle scalar-to-vector here?
993    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
994
995    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
996    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
997    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
998    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
999    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
1000
1001    // i8 and i16 vectors are custom because the source register and source
1002    // memory operand types are not the same width.  f32 vectors are
1003    // custom since the immediate controlling the insert encodes additional
1004    // information.
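    // (PINSRB/PINSRW, for instance, read their element from a 32-bit GPR even
    // though only the low 8/16 bits are used, and INSERTPS's immediate also
    // selects source and destination lanes plus a zero mask.)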
1005    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1006    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1007    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1008    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1009
1010    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1011    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1012    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1013    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1014
1015    // FIXME: these should be Legal, but that's only for the case where
1016    // the index is constant.  For now custom expand to deal with that.
1017    if (Subtarget->is64Bit()) {
1018      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1019      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1020    }
1021  }
1022
1023  if (Subtarget->hasSSE2()) {
1024    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1025    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1026
1027    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1028    setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1029
1030    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1031    setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1032
1033    if (Subtarget->hasInt256()) {
1034      setOperationAction(ISD::SRL,             MVT::v2i64, Legal);
1035      setOperationAction(ISD::SRL,             MVT::v4i32, Legal);
1036
1037      setOperationAction(ISD::SHL,             MVT::v2i64, Legal);
1038      setOperationAction(ISD::SHL,             MVT::v4i32, Legal);
1039
1040      setOperationAction(ISD::SRA,             MVT::v4i32, Legal);
1041    } else {
1042      setOperationAction(ISD::SRL,             MVT::v2i64, Custom);
1043      setOperationAction(ISD::SRL,             MVT::v4i32, Custom);
1044
1045      setOperationAction(ISD::SHL,             MVT::v2i64, Custom);
1046      setOperationAction(ISD::SHL,             MVT::v4i32, Custom);
1047
1048      setOperationAction(ISD::SRA,             MVT::v4i32, Custom);
1049    }
1050  }
1051
1052  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1053    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1054    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1055    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1056    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1057    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1058    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1059
1060    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1061    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1062    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1063
1064    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1065    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1066    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1067    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1068    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1069    setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1070    setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1071    setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1072    setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1073    setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1074    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1075    setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1076
1077    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1078    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1079    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1080    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1081    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1082    setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1083    setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1084    setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1085    setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1086    setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1087    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1088    setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1089
1090    setOperationAction(ISD::TRUNCATE,           MVT::v8i16, Custom);
1091    setOperationAction(ISD::TRUNCATE,           MVT::v4i32, Custom);
1092
1093    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
1094
1095    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1096    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1097    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1098
1099    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i32, Custom);
1100    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1101    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1102
1103    setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
1104
1105    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1106    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1107
1108    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1109    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1110
1111    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1112    setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1113
1114    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1115    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1116    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1117    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1118
1119    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1120    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1121    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1122
1123    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
1124    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
1125    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
1126    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
1127
1128    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1129      setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1130      setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1131      setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1132      setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1133      setOperationAction(ISD::FMA,             MVT::f32, Legal);
1134      setOperationAction(ISD::FMA,             MVT::f64, Legal);
1135    }
1136
1137    if (Subtarget->hasInt256()) {
1138      setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1139      setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1140      setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1141      setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1142
1143      setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1144      setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1145      setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1146      setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1147
1148      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1149      setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1150      setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1151      // Don't lower v32i8 because there is no 128-bit byte mul
1152
1153      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1154
1155      setOperationAction(ISD::SRL,             MVT::v4i64, Legal);
1156      setOperationAction(ISD::SRL,             MVT::v8i32, Legal);
1157
1158      setOperationAction(ISD::SHL,             MVT::v4i64, Legal);
1159      setOperationAction(ISD::SHL,             MVT::v8i32, Legal);
1160
1161      setOperationAction(ISD::SRA,             MVT::v8i32, Legal);
1162    } else {
1163      setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1164      setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1165      setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1166      setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1167
1168      setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1169      setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1170      setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1171      setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1172
1173      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1174      setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1175      setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1176      // Don't lower v32i8 because there is no 128-bit byte mul
1177
1178      setOperationAction(ISD::SRL,             MVT::v4i64, Custom);
1179      setOperationAction(ISD::SRL,             MVT::v8i32, Custom);
1180
1181      setOperationAction(ISD::SHL,             MVT::v4i64, Custom);
1182      setOperationAction(ISD::SHL,             MVT::v8i32, Custom);
1183
1184      setOperationAction(ISD::SRA,             MVT::v8i32, Custom);
1185    }
1186
1187    // Custom lower several nodes for 256-bit types.
1188    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
1189             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
1190      MVT VT = (MVT::SimpleValueType)i;
1191
1192      // Extract subvector is special because the value type
1193      // (result) is 128-bit but the source is 256-bit wide.
1194      if (VT.is128BitVector())
1195        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1196
1197      // Do not attempt to custom lower other non-256-bit vectors
1198      if (!VT.is256BitVector())
1199        continue;
1200
1201      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1202      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1203      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1204      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1205      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1206      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1207      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1208    }
1209
1210    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1211    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1212      MVT VT = (MVT::SimpleValueType)i;
1213
1214      // Do not attempt to promote non-256-bit vectors
1215      if (!VT.is256BitVector())
1216        continue;
1217
1218      setOperationAction(ISD::AND,    VT, Promote);
1219      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1220      setOperationAction(ISD::OR,     VT, Promote);
1221      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1222      setOperationAction(ISD::XOR,    VT, Promote);
1223      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1224      setOperationAction(ISD::LOAD,   VT, Promote);
1225      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1226      setOperationAction(ISD::SELECT, VT, Promote);
1227      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1228    }
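    // Illustrative sketch of what the promotion above buys us (not code from
    // this file): a bitwise op on one of the promoted 256-bit types is
    // rewritten in terms of v4i64, e.g.
    //   %r = and <8 x i32> %a, %b
    // is selected as if the operands were bitcast to <4 x i64>, ANDed there,
    // and bitcast back, so a single 256-bit AND instruction (e.g. VANDPS on
    // AVX1) covers all of the integer element widths.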
1229  }
1230
1231  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1232  // of this type with custom code.
1233  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
1234           VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
1235    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
1236                       Custom);
1237  }
1238
1239  // We want to custom lower some of our intrinsics.
1240  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1241  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1242
1243  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1244  // handle type legalization for these operations here.
1245  //
1246  // FIXME: We really should do custom legalization for addition and
1247  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1248  // than generic legalization for 64-bit multiplication-with-overflow, though.
1249  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1250    // Add/Sub/Mul with overflow operations are custom lowered.
1251    MVT VT = IntVTs[i];
1252    setOperationAction(ISD::SADDO, VT, Custom);
1253    setOperationAction(ISD::UADDO, VT, Custom);
1254    setOperationAction(ISD::SSUBO, VT, Custom);
1255    setOperationAction(ISD::USUBO, VT, Custom);
1256    setOperationAction(ISD::SMULO, VT, Custom);
1257    setOperationAction(ISD::UMULO, VT, Custom);
1258  }
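  // Rough illustration of the custom lowering requested above (the actual
  // lowering is done by LowerXALUO later in this file): an i32
  // @llvm.sadd.with.overflow becomes an X86 add that also defines EFLAGS,
  // followed by a read of the overflow flag, rather than a compare-based
  // expansion.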
1259
1260  // There are no 8-bit 3-address imul/mul instructions
1261  setOperationAction(ISD::SMULO, MVT::i8, Expand);
1262  setOperationAction(ISD::UMULO, MVT::i8, Expand);
1263
1264  if (!Subtarget->is64Bit()) {
1265    // These libcalls are not available in 32-bit.
1266    setLibcallName(RTLIB::SHL_I128, 0);
1267    setLibcallName(RTLIB::SRL_I128, 0);
1268    setLibcallName(RTLIB::SRA_I128, 0);
1269  }
1270
1271  // We have target-specific dag combine patterns for the following nodes:
1272  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1273  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1274  setTargetDAGCombine(ISD::VSELECT);
1275  setTargetDAGCombine(ISD::SELECT);
1276  setTargetDAGCombine(ISD::SHL);
1277  setTargetDAGCombine(ISD::SRA);
1278  setTargetDAGCombine(ISD::SRL);
1279  setTargetDAGCombine(ISD::OR);
1280  setTargetDAGCombine(ISD::AND);
1281  setTargetDAGCombine(ISD::ADD);
1282  setTargetDAGCombine(ISD::FADD);
1283  setTargetDAGCombine(ISD::FSUB);
1284  setTargetDAGCombine(ISD::FMA);
1285  setTargetDAGCombine(ISD::SUB);
1286  setTargetDAGCombine(ISD::LOAD);
1287  setTargetDAGCombine(ISD::STORE);
1288  setTargetDAGCombine(ISD::ZERO_EXTEND);
1289  setTargetDAGCombine(ISD::ANY_EXTEND);
1290  setTargetDAGCombine(ISD::SIGN_EXTEND);
1291  setTargetDAGCombine(ISD::TRUNCATE);
1292  setTargetDAGCombine(ISD::SINT_TO_FP);
1293  setTargetDAGCombine(ISD::SETCC);
1294  if (Subtarget->is64Bit())
1295    setTargetDAGCombine(ISD::MUL);
1296  setTargetDAGCombine(ISD::XOR);
1297
1298  computeRegisterProperties();
1299
1300  // On Darwin, -Os means optimize for size without hurting performance,
1301  // so do not reduce the limit.
1302  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1303  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1304  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1305  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1306  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1307  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
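  // Illustrative effect of the limits above: with maxStoresPerMemset = 16, a
  // constant-length @llvm.memset that can be covered by up to 16 "optimal"
  // stores (e.g. 256 bytes with 16-byte vector stores) is expanded inline;
  // anything larger falls back to a call to memset.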
1308  setPrefLoopAlignment(4); // 2^4 bytes.
1309  benefitFromCodePlacementOpt = true;
1310
1311  // Predictable cmovs don't hurt on Atom because it's in-order.
1312  predictableSelectIsExpensive = !Subtarget->isAtom();
1313
1314  setPrefFunctionAlignment(4); // 2^4 bytes.
1315}
1316
1317EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
1318  if (!VT.isVector()) return MVT::i8;
1319  return VT.changeVectorElementTypeToInteger();
1320}
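// For example (illustrative): a scalar i32 or f64 compare yields an i8
// result, while a compare of two v4f32 operands yields a v4i32 mask whose
// lanes are all-ones or all-zeros.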
1321
1322/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1323/// the desired ByVal argument alignment.
1324static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1325  if (MaxAlign == 16)
1326    return;
1327  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1328    if (VTy->getBitWidth() == 128)
1329      MaxAlign = 16;
1330  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1331    unsigned EltAlign = 0;
1332    getMaxByValAlign(ATy->getElementType(), EltAlign);
1333    if (EltAlign > MaxAlign)
1334      MaxAlign = EltAlign;
1335  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1336    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1337      unsigned EltAlign = 0;
1338      getMaxByValAlign(STy->getElementType(i), EltAlign);
1339      if (EltAlign > MaxAlign)
1340        MaxAlign = EltAlign;
1341      if (MaxAlign == 16)
1342        break;
1343    }
1344  }
1345}
1346
1347/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1348/// function arguments in the caller parameter area. For X86, aggregates
1349/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1350/// are at 4-byte boundaries.
1351unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1352  if (Subtarget->is64Bit()) {
1353    // Max of 8 and alignment of type.
1354    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1355    if (TyAlign > 8)
1356      return TyAlign;
1357    return 8;
1358  }
1359
1360  unsigned Align = 4;
1361  if (Subtarget->hasSSE1())
1362    getMaxByValAlign(Ty, Align);
1363  return Align;
1364}
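// Illustrative examples of the rules above: on x86-64 a byval struct of two
// i32 fields is given alignment 8; on 32-bit x86 with SSE, a struct containing
// a <4 x float> member is bumped to 16 bytes by getMaxByValAlign, while a
// plain { i32, i32 } stays at the default 4.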
1365
1366/// getOptimalMemOpType - Returns the target specific optimal type for load
1367/// and store operations as a result of memset, memcpy, and memmove
1368/// lowering. If DstAlign is zero, it is safe to assume that the destination
1369/// alignment can satisfy any constraint. Similarly, if SrcAlign is zero there
1370/// is no need to check it against an alignment requirement, probably because
1371/// the source does not need to be loaded. If 'IsMemset' is
1372/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1373/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1374/// source is constant so it does not need to be loaded.
1375/// It returns EVT::Other if the type should be determined using generic
1376/// target-independent logic.
1377EVT
1378X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1379                                       unsigned DstAlign, unsigned SrcAlign,
1380                                       bool IsMemset, bool ZeroMemset,
1381                                       bool MemcpyStrSrc,
1382                                       MachineFunction &MF) const {
1383  const Function *F = MF.getFunction();
1384  if ((!IsMemset || ZeroMemset) &&
1385      !F->getFnAttributes().hasAttribute(Attribute::NoImplicitFloat)) {
1386    if (Size >= 16 &&
1387        (Subtarget->isUnalignedMemAccessFast() ||
1388         ((DstAlign == 0 || DstAlign >= 16) &&
1389          (SrcAlign == 0 || SrcAlign >= 16)))) {
1390      if (Size >= 32) {
1391        if (Subtarget->hasInt256())
1392          return MVT::v8i32;
1393        if (Subtarget->hasFp256())
1394          return MVT::v8f32;
1395      }
1396      if (Subtarget->hasSSE2())
1397        return MVT::v4i32;
1398      if (Subtarget->hasSSE1())
1399        return MVT::v4f32;
1400    } else if (!MemcpyStrSrc && Size >= 8 &&
1401               !Subtarget->is64Bit() &&
1402               Subtarget->hasSSE2()) {
1403      // Do not use f64 to lower memcpy if the source is a string constant.
1404      // It's better to use i32 to avoid the loads.
1405      return MVT::f64;
1406    }
1407  }
1408  if (Subtarget->is64Bit() && Size >= 8)
1409    return MVT::i64;
1410  return MVT::i32;
1411}
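// A few illustrative outcomes of the selection above: a 32-byte memcpy with
// 16-byte-aligned (or fast-unaligned) operands returns v8i32 on AVX2 and
// v4i32 with only SSE2; an 8-byte memcpy whose source is not a constant
// string returns f64 on 32-bit SSE2 targets so one load/store pair suffices;
// everything else falls back to i64 on x86-64 or i32 on x86-32.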
1412
1413bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1414  if (VT == MVT::f32)
1415    return X86ScalarSSEf32;
1416  else if (VT == MVT::f64)
1417    return X86ScalarSSEf64;
1418  return true;
1419}
1420
1421bool
1422X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
1423  if (Fast)
1424    *Fast = Subtarget->isUnalignedMemAccessFast();
1425  return true;
1426}
1427
1428/// getJumpTableEncoding - Return the entry encoding for a jump table in the
1429/// current function.  The returned value is a member of the
1430/// MachineJumpTableInfo::JTEntryKind enum.
1431unsigned X86TargetLowering::getJumpTableEncoding() const {
1432  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1433  // symbol.
1434  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1435      Subtarget->isPICStyleGOT())
1436    return MachineJumpTableInfo::EK_Custom32;
1437
1438  // Otherwise, use the normal jump table encoding heuristics.
1439  return TargetLowering::getJumpTableEncoding();
1440}
1441
1442const MCExpr *
1443X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1444                                             const MachineBasicBlock *MBB,
1445                                             unsigned uid,MCContext &Ctx) const{
1446  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1447         Subtarget->isPICStyleGOT());
1448  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1449  // relocations.
1450  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1451                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1452}
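// Illustrative asm for the @GOTOFF entries produced above (assumed AT&T
// syntax):
//   .long .LBB0_4@GOTOFF
// i.e. each table entry is an offset from the GOT base rather than an
// absolute address, which keeps the table position-independent.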
1453
1454/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
1455/// jumptable.
1456SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1457                                                    SelectionDAG &DAG) const {
1458  if (!Subtarget->is64Bit())
1459    // This doesn't have DebugLoc associated with it, but is not really the
1460    // same as a Register.
1461    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
1462  return Table;
1463}
1464
1465/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1466/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1467/// MCExpr.
1468const MCExpr *X86TargetLowering::
1469getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1470                             MCContext &Ctx) const {
1471  // X86-64 uses RIP relative addressing based on the jump table label.
1472  if (Subtarget->isPICStyleRIPRel())
1473    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1474
1475  // Otherwise, the reference is relative to the PIC base.
1476  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1477}
1478
1479// FIXME: Why is this routine here? Move to RegInfo!
1480std::pair<const TargetRegisterClass*, uint8_t>
1481X86TargetLowering::findRepresentativeClass(MVT VT) const{
1482  const TargetRegisterClass *RRC = 0;
1483  uint8_t Cost = 1;
1484  switch (VT.SimpleTy) {
1485  default:
1486    return TargetLowering::findRepresentativeClass(VT);
1487  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1488    RRC = Subtarget->is64Bit() ?
1489      (const TargetRegisterClass*)&X86::GR64RegClass :
1490      (const TargetRegisterClass*)&X86::GR32RegClass;
1491    break;
1492  case MVT::x86mmx:
1493    RRC = &X86::VR64RegClass;
1494    break;
1495  case MVT::f32: case MVT::f64:
1496  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1497  case MVT::v4f32: case MVT::v2f64:
1498  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1499  case MVT::v4f64:
1500    RRC = &X86::VR128RegClass;
1501    break;
1502  }
1503  return std::make_pair(RRC, Cost);
1504}
1505
1506bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1507                                               unsigned &Offset) const {
1508  if (!Subtarget->isTargetLinux())
1509    return false;
1510
1511  if (Subtarget->is64Bit()) {
1512    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
1513    Offset = 0x28;
1514    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1515      AddressSpace = 256;
1516    else
1517      AddressSpace = 257;
1518  } else {
1519    // %gs:0x14 on i386
1520    Offset = 0x14;
1521    AddressSpace = 256;
1522  }
1523  return true;
1524}
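// Illustrative use of the values above: address space 256 corresponds to the
// GS segment and 257 to FS in the X86 backend, so on x86-64 Linux the cookie
// is read roughly as "movq %fs:0x28, %reg", and on i386 as
// "movl %gs:0x14, %reg" (sketch, not emitted by this function).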
1525
1526//===----------------------------------------------------------------------===//
1527//               Return Value Calling Convention Implementation
1528//===----------------------------------------------------------------------===//
1529
1530#include "X86GenCallingConv.inc"
1531
1532bool
1533X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1534                                  MachineFunction &MF, bool isVarArg,
1535                        const SmallVectorImpl<ISD::OutputArg> &Outs,
1536                        LLVMContext &Context) const {
1537  SmallVector<CCValAssign, 16> RVLocs;
1538  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1539                 RVLocs, Context);
1540  return CCInfo.CheckReturn(Outs, RetCC_X86);
1541}
1542
1543SDValue
1544X86TargetLowering::LowerReturn(SDValue Chain,
1545                               CallingConv::ID CallConv, bool isVarArg,
1546                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1547                               const SmallVectorImpl<SDValue> &OutVals,
1548                               DebugLoc dl, SelectionDAG &DAG) const {
1549  MachineFunction &MF = DAG.getMachineFunction();
1550  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1551
1552  SmallVector<CCValAssign, 16> RVLocs;
1553  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1554                 RVLocs, *DAG.getContext());
1555  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1556
1557  // Add the regs to the liveout set for the function.
1558  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1559  for (unsigned i = 0; i != RVLocs.size(); ++i)
1560    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
1561      MRI.addLiveOut(RVLocs[i].getLocReg());
1562
1563  SDValue Flag;
1564
1565  SmallVector<SDValue, 6> RetOps;
1566  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1567  // Operand #1 = Bytes To Pop
1568  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1569                   MVT::i16));
1570
1571  // Copy the result values into the output registers.
1572  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1573    CCValAssign &VA = RVLocs[i];
1574    assert(VA.isRegLoc() && "Can only return in registers!");
1575    SDValue ValToCopy = OutVals[i];
1576    EVT ValVT = ValToCopy.getValueType();
1577
1578    // Promote values to the appropriate types
1579    if (VA.getLocInfo() == CCValAssign::SExt)
1580      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
1581    else if (VA.getLocInfo() == CCValAssign::ZExt)
1582      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
1583    else if (VA.getLocInfo() == CCValAssign::AExt)
1584      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
1585    else if (VA.getLocInfo() == CCValAssign::BCvt)
1586      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
1587
1588    // If this is x86-64, and we disabled SSE, we can't return FP values,
1589    // or SSE or MMX vectors.
1590    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
1591         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
1592          (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
1593      report_fatal_error("SSE register return with SSE disabled");
1594    }
1595    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
1596    // llvm-gcc has never done it right and no one has noticed, so this
1597    // should be OK for now.
1598    if (ValVT == MVT::f64 &&
1599        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
1600      report_fatal_error("SSE2 register return with SSE2 disabled");
1601
1602    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1603    // the RET instruction and handled by the FP Stackifier.
1604    if (VA.getLocReg() == X86::ST0 ||
1605        VA.getLocReg() == X86::ST1) {
1606      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1607      // change the value to the FP stack register class.
1608      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1609        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1610      RetOps.push_back(ValToCopy);
1611      // Don't emit a copytoreg.
1612      continue;
1613    }
1614
1615    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1616    // which is returned in RAX / RDX.
1617    if (Subtarget->is64Bit()) {
1618      if (ValVT == MVT::x86mmx) {
1619        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1620          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
1621          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
1622                                  ValToCopy);
1623          // If we don't have SSE2 available, convert to v4f32 so the generated
1624          // register is legal.
1625          if (!Subtarget->hasSSE2())
1626            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
1627        }
1628      }
1629    }
1630
1631    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1632    Flag = Chain.getValue(1);
1633  }
1634
1635  // The x86-64 ABI for returning structs by value requires that we copy
1636  // the sret argument into %rax for the return. We saved the argument into
1637  // a virtual register in the entry block, so now we copy the value out
1638  // and into %rax.
1639  if (Subtarget->is64Bit() &&
1640      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1641    MachineFunction &MF = DAG.getMachineFunction();
1642    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1643    unsigned Reg = FuncInfo->getSRetReturnReg();
1644    assert(Reg &&
1645           "SRetReturnReg should have been set in LowerFormalArguments().");
1646    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1647
1648    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1649    Flag = Chain.getValue(1);
1650
1651    // RAX now acts like a return value.
1652    MRI.addLiveOut(X86::RAX);
1653  }
1654
1655  RetOps[0] = Chain;  // Update chain.
1656
1657  // Add the flag if we have it.
1658  if (Flag.getNode())
1659    RetOps.push_back(Flag);
1660
1661  return DAG.getNode(X86ISD::RET_FLAG, dl,
1662                     MVT::Other, &RetOps[0], RetOps.size());
1663}
1664
1665bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
1666  if (N->getNumValues() != 1)
1667    return false;
1668  if (!N->hasNUsesOfValue(1, 0))
1669    return false;
1670
1671  SDValue TCChain = Chain;
1672  SDNode *Copy = *N->use_begin();
1673  if (Copy->getOpcode() == ISD::CopyToReg) {
1674    // If the copy has a glue operand, we conservatively assume it isn't safe to
1675    // perform a tail call.
1676    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
1677      return false;
1678    TCChain = Copy->getOperand(0);
1679  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
1680    return false;
1681
1682  bool HasRet = false;
1683  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
1684       UI != UE; ++UI) {
1685    if (UI->getOpcode() != X86ISD::RET_FLAG)
1686      return false;
1687    HasRet = true;
1688  }
1689
1690  if (!HasRet)
1691    return false;
1692
1693  Chain = TCChain;
1694  return true;
1695}
1696
1697MVT
1698X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
1699                                            ISD::NodeType ExtendKind) const {
1700  MVT ReturnMVT;
1701  // TODO: Is this also valid on 32-bit?
1702  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
1703    ReturnMVT = MVT::i8;
1704  else
1705    ReturnMVT = MVT::i32;
1706
1707  MVT MinVT = getRegisterType(ReturnMVT);
1708  return VT.bitsLT(MinVT) ? MinVT : VT;
1709}
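// For example (illustrative): a zero-extended i1 return value on x86-64 is
// promoted only to i8 here rather than i32; on 32-bit targets, or for other
// extension kinds, it is widened to at least i32 as usual.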
1710
1711/// LowerCallResult - Lower the result values of a call into the
1712/// appropriate copies out of appropriate physical registers.
1713///
1714SDValue
1715X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1716                                   CallingConv::ID CallConv, bool isVarArg,
1717                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1718                                   DebugLoc dl, SelectionDAG &DAG,
1719                                   SmallVectorImpl<SDValue> &InVals) const {
1720
1721  // Assign locations to each value returned by this call.
1722  SmallVector<CCValAssign, 16> RVLocs;
1723  bool Is64Bit = Subtarget->is64Bit();
1724  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1725                 getTargetMachine(), RVLocs, *DAG.getContext());
1726  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1727
1728  // Copy all of the result registers out of their specified physreg.
1729  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1730    CCValAssign &VA = RVLocs[i];
1731    EVT CopyVT = VA.getValVT();
1732
1733    // If this is x86-64, and we disabled SSE, we can't return FP values
1734    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1735        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1736      report_fatal_error("SSE register return with SSE disabled");
1737    }
1738
1739    SDValue Val;
1740
1741    // If this is a call to a function that returns an fp value on the floating
1742    // point stack, we must guarantee the value is popped from the stack, so
1743    // a CopyFromReg is not good enough - the copy instruction may be eliminated
1744    // if the return value is not used. We use the FpPOP_RETVAL instruction
1745    // instead.
1746    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1747      // If we prefer to use the value in xmm registers, copy it out as f80 and
1748      // use a truncate to move it from fp stack reg to xmm reg.
1749      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1750      SDValue Ops[] = { Chain, InFlag };
1751      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
1752                                         MVT::Other, MVT::Glue, Ops, 2), 1);
1753      Val = Chain.getValue(0);
1754
1755      // Round the f80 to the right size, which also moves it to the appropriate
1756      // xmm register.
1757      if (CopyVT != VA.getValVT())
1758        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1759                          // This truncation won't change the value.
1760                          DAG.getIntPtrConstant(1));
1761    } else {
1762      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1763                                 CopyVT, InFlag).getValue(1);
1764      Val = Chain.getValue(0);
1765    }
1766    InFlag = Chain.getValue(2);
1767    InVals.push_back(Val);
1768  }
1769
1770  return Chain;
1771}
1772
1773//===----------------------------------------------------------------------===//
1774//                C & StdCall & Fast Calling Convention implementation
1775//===----------------------------------------------------------------------===//
1776//  The StdCall calling convention is the standard for many Windows API
1777//  routines. It differs from the C calling convention only slightly: the
1778//  callee cleans up the stack rather than the caller, and symbols are
1779//  decorated in a particular way. It does not support any vector arguments.
1780//  For info on the fast calling convention see the Fast Calling Convention
1781//  (tail call) implementation in LowerX86_32FastCCCallTo.
1782
1783/// callIsStructReturn - Determines whether a call uses struct return
1784/// semantics.
1785enum StructReturnType {
1786  NotStructReturn,
1787  RegStructReturn,
1788  StackStructReturn
1789};
1790static StructReturnType
1791callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1792  if (Outs.empty())
1793    return NotStructReturn;
1794
1795  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
1796  if (!Flags.isSRet())
1797    return NotStructReturn;
1798  if (Flags.isInReg())
1799    return RegStructReturn;
1800  return StackStructReturn;
1801}
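// Rough source-level picture of the three cases above (illustrative): a call
// whose first argument carries the 'sret' attribute, e.g.
//   call void @f(%struct.S* sret %tmp, i32 %x)
// is a StackStructReturn; if that sret argument is additionally marked
// 'inreg' it is a RegStructReturn; otherwise the call is NotStructReturn.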
1802
1803/// argsAreStructReturn - Determines whether a function uses struct
1804/// return semantics.
1805static StructReturnType
1806argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1807  if (Ins.empty())
1808    return NotStructReturn;
1809
1810  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
1811  if (!Flags.isSRet())
1812    return NotStructReturn;
1813  if (Flags.isInReg())
1814    return RegStructReturn;
1815  return StackStructReturn;
1816}
1817
1818/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1819/// by "Src" to address "Dst" with size and alignment information specified by
1820/// the specific parameter attribute. The copy will be passed as a byval
1821/// function parameter.
1822static SDValue
1823CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1824                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1825                          DebugLoc dl) {
1826  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1827
1828  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1829                       /*isVolatile*/false, /*AlwaysInline=*/true,
1830                       MachinePointerInfo(), MachinePointerInfo());
1831}
1832
1833/// IsTailCallConvention - Return true if the calling convention is one that
1834/// supports tail call optimization.
1835static bool IsTailCallConvention(CallingConv::ID CC) {
1836  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
1837          CC == CallingConv::HiPE);
1838}
1839
1840bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
1841  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
1842    return false;
1843
1844  CallSite CS(CI);
1845  CallingConv::ID CalleeCC = CS.getCallingConv();
1846  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1847    return false;
1848
1849  return true;
1850}
1851
1852/// FuncIsMadeTailCallSafe - Return true if the function is being made into
1853/// a tailcall target by changing its ABI.
1854static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
1855                                   bool GuaranteedTailCallOpt) {
1856  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1857}
1858
1859SDValue
1860X86TargetLowering::LowerMemArgument(SDValue Chain,
1861                                    CallingConv::ID CallConv,
1862                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1863                                    DebugLoc dl, SelectionDAG &DAG,
1864                                    const CCValAssign &VA,
1865                                    MachineFrameInfo *MFI,
1866                                    unsigned i) const {
1867  // Create the nodes corresponding to a load from this parameter slot.
1868  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1869  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
1870                              getTargetMachine().Options.GuaranteedTailCallOpt);
1871  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1872  EVT ValVT;
1873
1874  // If the value is passed by pointer, we have the address passed instead of
1875  // the value itself.
1876  if (VA.getLocInfo() == CCValAssign::Indirect)
1877    ValVT = VA.getLocVT();
1878  else
1879    ValVT = VA.getValVT();
1880
1881  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1882  // changed with more analysis.
1883  // In the case of tail call optimization, mark all arguments mutable, since
1884  // they could be overwritten when lowering the arguments of a tail call.
1885  if (Flags.isByVal()) {
1886    unsigned Bytes = Flags.getByValSize();
1887    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1888    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
1889    return DAG.getFrameIndex(FI, getPointerTy());
1890  } else {
1891    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1892                                    VA.getLocMemOffset(), isImmutable);
1893    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1894    return DAG.getLoad(ValVT, dl, Chain, FIN,
1895                       MachinePointerInfo::getFixedStack(FI),
1896                       false, false, false, 0);
1897  }
1898}
1899
1900SDValue
1901X86TargetLowering::LowerFormalArguments(SDValue Chain,
1902                                        CallingConv::ID CallConv,
1903                                        bool isVarArg,
1904                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1905                                        DebugLoc dl,
1906                                        SelectionDAG &DAG,
1907                                        SmallVectorImpl<SDValue> &InVals)
1908                                          const {
1909  MachineFunction &MF = DAG.getMachineFunction();
1910  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1911
1912  const Function* Fn = MF.getFunction();
1913  if (Fn->hasExternalLinkage() &&
1914      Subtarget->isTargetCygMing() &&
1915      Fn->getName() == "main")
1916    FuncInfo->setForceFramePointer(true);
1917
1918  MachineFrameInfo *MFI = MF.getFrameInfo();
1919  bool Is64Bit = Subtarget->is64Bit();
1920  bool IsWindows = Subtarget->isTargetWindows();
1921  bool IsWin64 = Subtarget->isTargetWin64();
1922
1923  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1924         "Var args not supported with calling convention fastcc, ghc or hipe");
1925
1926  // Assign locations to all of the incoming arguments.
1927  SmallVector<CCValAssign, 16> ArgLocs;
1928  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1929                 ArgLocs, *DAG.getContext());
1930
1931  // Allocate shadow area for Win64
1932  if (IsWin64) {
1933    CCInfo.AllocateStack(32, 8);
1934  }
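  // Illustrative background for the 32 bytes above: the Win64 ABI reserves a
  // four-slot (4 x 8 byte) "home" area just above the return address for the
  // first four parameters, so those stack bytes are claimed here before any
  // other argument locations are assigned.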
1935
1936  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
1937
1938  unsigned LastVal = ~0U;
1939  SDValue ArgValue;
1940  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1941    CCValAssign &VA = ArgLocs[i];
1942    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1943    // places.
1944    assert(VA.getValNo() != LastVal &&
1945           "Don't support value assigned to multiple locs yet");
1946    (void)LastVal;
1947    LastVal = VA.getValNo();
1948
1949    if (VA.isRegLoc()) {
1950      EVT RegVT = VA.getLocVT();
1951      const TargetRegisterClass *RC;
1952      if (RegVT == MVT::i32)
1953        RC = &X86::GR32RegClass;
1954      else if (Is64Bit && RegVT == MVT::i64)
1955        RC = &X86::GR64RegClass;
1956      else if (RegVT == MVT::f32)
1957        RC = &X86::FR32RegClass;
1958      else if (RegVT == MVT::f64)
1959        RC = &X86::FR64RegClass;
1960      else if (RegVT.is256BitVector())
1961        RC = &X86::VR256RegClass;
1962      else if (RegVT.is128BitVector())
1963        RC = &X86::VR128RegClass;
1964      else if (RegVT == MVT::x86mmx)
1965        RC = &X86::VR64RegClass;
1966      else
1967        llvm_unreachable("Unknown argument type!");
1968
1969      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1970      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1971
1972      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1973      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1974      // right size.
1975      if (VA.getLocInfo() == CCValAssign::SExt)
1976        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1977                               DAG.getValueType(VA.getValVT()));
1978      else if (VA.getLocInfo() == CCValAssign::ZExt)
1979        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1980                               DAG.getValueType(VA.getValVT()));
1981      else if (VA.getLocInfo() == CCValAssign::BCvt)
1982        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
1983
1984      if (VA.isExtInLoc()) {
1985        // Handle MMX values passed in XMM regs.
1986        if (RegVT.isVector()) {
1987          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(),
1988                                 ArgValue);
1989        } else
1990          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1991      }
1992    } else {
1993      assert(VA.isMemLoc());
1994      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1995    }
1996
1997    // If value is passed via pointer - do a load.
1998    if (VA.getLocInfo() == CCValAssign::Indirect)
1999      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2000                             MachinePointerInfo(), false, false, false, 0);
2001
2002    InVals.push_back(ArgValue);
2003  }
2004
2005  // The x86-64 ABI for returning structs by value requires that we copy
2006  // the sret argument into %rax for the return. Save the argument into
2007  // a virtual register so that we can access it from the return points.
2008  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
2009    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2010    unsigned Reg = FuncInfo->getSRetReturnReg();
2011    if (!Reg) {
2012      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
2013      FuncInfo->setSRetReturnReg(Reg);
2014    }
2015    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
2016    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2017  }
2018
2019  unsigned StackSize = CCInfo.getNextStackOffset();
2020  // Align stack specially for tail calls.
2021  if (FuncIsMadeTailCallSafe(CallConv,
2022                             MF.getTarget().Options.GuaranteedTailCallOpt))
2023    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2024
2025  // If the function takes a variable number of arguments, make a frame index
2026  // for the start of the first vararg value... for expansion of llvm.va_start.
2027  if (isVarArg) {
2028    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2029                    CallConv != CallingConv::X86_ThisCall)) {
2030      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
2031    }
2032    if (Is64Bit) {
2033      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
2034
2035      // FIXME: We should really autogenerate these arrays
2036      static const uint16_t GPR64ArgRegsWin64[] = {
2037        X86::RCX, X86::RDX, X86::R8,  X86::R9
2038      };
2039      static const uint16_t GPR64ArgRegs64Bit[] = {
2040        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2041      };
2042      static const uint16_t XMMArgRegs64Bit[] = {
2043        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2044        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2045      };
2046      const uint16_t *GPR64ArgRegs;
2047      unsigned NumXMMRegs = 0;
2048
2049      if (IsWin64) {
2050        // The XMM registers which might contain var arg parameters are shadowed
2051        // in their paired GPRs, so we only need to save the GPRs to their home
2052        // slots.
2053        TotalNumIntRegs = 4;
2054        GPR64ArgRegs = GPR64ArgRegsWin64;
2055      } else {
2056        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
2057        GPR64ArgRegs = GPR64ArgRegs64Bit;
2058
2059        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
2060                                                TotalNumXMMRegs);
2061      }
2062      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
2063                                                       TotalNumIntRegs);
2064
2065      bool NoImplicitFloatOps = Fn->getFnAttributes().
2066        hasAttribute(Attribute::NoImplicitFloat);
2067      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2068             "SSE register cannot be used when SSE is disabled!");
2069      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
2070               NoImplicitFloatOps) &&
2071             "SSE register cannot be used when SSE is disabled!");
2072      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2073          !Subtarget->hasSSE1())
2074        // Kernel mode asks for SSE to be disabled, so don't push the XMM
2075        // registers on the stack.
2076        TotalNumXMMRegs = 0;
2077
2078      if (IsWin64) {
2079        const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
2080        // Get to the caller-allocated home save location.  Add 8 to account
2081        // for the return address.
2082        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2083        FuncInfo->setRegSaveFrameIndex(
2084          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2085        // Fixup to set vararg frame on shadow area (4 x i64).
2086        if (NumIntRegs < 4)
2087          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2088      } else {
2089        // For X86-64, if there are vararg parameters that are passed via
2090        // registers, then we must store them to their spots on the stack so
2091        // they may be loaded by dereferencing the result of va_next.
2092        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2093        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
2094        FuncInfo->setRegSaveFrameIndex(
2095          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
2096                               false));
2097      }
2098
2099      // Store the integer parameter registers.
2100      SmallVector<SDValue, 8> MemOps;
2101      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2102                                        getPointerTy());
2103      unsigned Offset = FuncInfo->getVarArgsGPOffset();
2104      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
2105        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2106                                  DAG.getIntPtrConstant(Offset));
2107        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
2108                                     &X86::GR64RegClass);
2109        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
2110        SDValue Store =
2111          DAG.getStore(Val.getValue(1), dl, Val, FIN,
2112                       MachinePointerInfo::getFixedStack(
2113                         FuncInfo->getRegSaveFrameIndex(), Offset),
2114                       false, false, 0);
2115        MemOps.push_back(Store);
2116        Offset += 8;
2117      }
2118
2119      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
2120        // Now store the XMM (fp + vector) parameter registers.
2121        SmallVector<SDValue, 11> SaveXMMOps;
2122        SaveXMMOps.push_back(Chain);
2123
2124        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2125        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
2126        SaveXMMOps.push_back(ALVal);
2127
2128        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2129                               FuncInfo->getRegSaveFrameIndex()));
2130        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2131                               FuncInfo->getVarArgsFPOffset()));
2132
2133        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
2134          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
2135                                       &X86::VR128RegClass);
2136          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
2137          SaveXMMOps.push_back(Val);
2138        }
2139        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2140                                     MVT::Other,
2141                                     &SaveXMMOps[0], SaveXMMOps.size()));
2142      }
2143
2144      if (!MemOps.empty())
2145        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2146                            &MemOps[0], MemOps.size());
2147    }
2148  }
2149
2150  // Some CCs need callee pop.
2151  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2152                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
2153    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2154  } else {
2155    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2156    // If this is an sret function, the return should pop the hidden pointer.
2157    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2158        argsAreStructReturn(Ins) == StackStructReturn)
2159      FuncInfo->setBytesToPopOnReturn(4);
2160  }
2161
2162  if (!Is64Bit) {
2163    // RegSaveFrameIndex is X86-64 only.
2164    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2165    if (CallConv == CallingConv::X86_FastCall ||
2166        CallConv == CallingConv::X86_ThisCall)
2167      // fastcc functions can't have varargs.
2168      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2169  }
2170
2171  FuncInfo->setArgumentStackSize(StackSize);
2172
2173  return Chain;
2174}
2175
2176SDValue
2177X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2178                                    SDValue StackPtr, SDValue Arg,
2179                                    DebugLoc dl, SelectionDAG &DAG,
2180                                    const CCValAssign &VA,
2181                                    ISD::ArgFlagsTy Flags) const {
2182  unsigned LocMemOffset = VA.getLocMemOffset();
2183  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2184  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2185  if (Flags.isByVal())
2186    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2187
2188  return DAG.getStore(Chain, dl, Arg, PtrOff,
2189                      MachinePointerInfo::getStack(LocMemOffset),
2190                      false, false, 0);
2191}
2192
2193/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
2194/// optimization is performed and it is required.
2195SDValue
2196X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2197                                           SDValue &OutRetAddr, SDValue Chain,
2198                                           bool IsTailCall, bool Is64Bit,
2199                                           int FPDiff, DebugLoc dl) const {
2200  // Adjust the Return address stack slot.
2201  EVT VT = getPointerTy();
2202  OutRetAddr = getReturnAddressFrameIndex(DAG);
2203
2204  // Load the "old" Return address.
2205  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2206                           false, false, false, 0);
2207  return SDValue(OutRetAddr.getNode(), 1);
2208}
2209
2210/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
2211/// optimization is performed and it is required (FPDiff!=0).
2212static SDValue
2213EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
2214                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
2215                         unsigned SlotSize, int FPDiff, DebugLoc dl) {
2216  // Store the return address to the appropriate stack slot.
2217  if (!FPDiff) return Chain;
2218  // Calculate the new stack slot for the return address.
2219  int NewReturnAddrFI =
2220    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
2221  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2222  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2223                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2224                       false, false, 0);
2225  return Chain;
2226}
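// Illustrative: FPDiff is roughly the difference between the caller's
// incoming argument area and the outgoing argument area of the tail call.
// When it is nonzero, the return address must be re-stored at
// FPDiff - SlotSize so that the callee's eventual RET still finds it
// immediately below its own arguments.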
2227
2228SDValue
2229X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2230                             SmallVectorImpl<SDValue> &InVals) const {
2231  SelectionDAG &DAG                     = CLI.DAG;
2232  DebugLoc &dl                          = CLI.DL;
2233  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2234  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
2235  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
2236  SDValue Chain                         = CLI.Chain;
2237  SDValue Callee                        = CLI.Callee;
2238  CallingConv::ID CallConv              = CLI.CallConv;
2239  bool &isTailCall                      = CLI.IsTailCall;
2240  bool isVarArg                         = CLI.IsVarArg;
2241
2242  MachineFunction &MF = DAG.getMachineFunction();
2243  bool Is64Bit        = Subtarget->is64Bit();
2244  bool IsWin64        = Subtarget->isTargetWin64();
2245  bool IsWindows      = Subtarget->isTargetWindows();
2246  StructReturnType SR = callIsStructReturn(Outs);
2247  bool IsSibcall      = false;
2248
2249  if (MF.getTarget().Options.DisableTailCalls)
2250    isTailCall = false;
2251
2252  if (isTailCall) {
2253    // Check if it's really possible to do a tail call.
2254    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2255                    isVarArg, SR != NotStructReturn,
2256                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2257                    Outs, OutVals, Ins, DAG);
2258
2259    // Sibcalls are automatically detected tailcalls which do not require
2260    // ABI changes.
2261    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2262      IsSibcall = true;
2263
2264    if (isTailCall)
2265      ++NumTailCalls;
2266  }
2267
2268  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2269         "Var args not supported with calling convention fastcc, ghc or hipe");
2270
2271  // Analyze operands of the call, assigning locations to each operand.
2272  SmallVector<CCValAssign, 16> ArgLocs;
2273  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2274                 ArgLocs, *DAG.getContext());
2275
2276  // Allocate shadow area for Win64
2277  if (IsWin64) {
2278    CCInfo.AllocateStack(32, 8);
2279  }
2280
2281  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2282
2283  // Get a count of how many bytes are to be pushed on the stack.
2284  unsigned NumBytes = CCInfo.getNextStackOffset();
2285  if (IsSibcall)
2286    // This is a sibcall. The memory operands are already available in the
2287    // caller's own caller's stack.
2288    NumBytes = 0;
2289  else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
2290           IsTailCallConvention(CallConv))
2291    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2292
2293  int FPDiff = 0;
2294  if (isTailCall && !IsSibcall) {
2295    // Lower arguments at fp - stackoffset + fpdiff.
2296    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2297    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2298
2299    FPDiff = NumBytesCallerPushed - NumBytes;
2300
2301    // Set the delta of movement of the returnaddr stackslot.
2302    // But only set if delta is greater than previous delta.
2303    if (FPDiff < X86Info->getTCReturnAddrDelta())
2304      X86Info->setTCReturnAddrDelta(FPDiff);
2305  }
2306
2307  if (!IsSibcall)
2308    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
2309
2310  SDValue RetAddrFrIdx;
2311  // Load return address for tail calls.
2312  if (isTailCall && FPDiff)
2313    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2314                                    Is64Bit, FPDiff, dl);
2315
2316  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2317  SmallVector<SDValue, 8> MemOpChains;
2318  SDValue StackPtr;
2319
2320  // Walk the register/memloc assignments, inserting copies/loads.  In the case
2321  // of tail call optimization, arguments are handled later.
2322  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2323    CCValAssign &VA = ArgLocs[i];
2324    EVT RegVT = VA.getLocVT();
2325    SDValue Arg = OutVals[i];
2326    ISD::ArgFlagsTy Flags = Outs[i].Flags;
2327    bool isByVal = Flags.isByVal();
2328
2329    // Promote the value if needed.
2330    switch (VA.getLocInfo()) {
2331    default: llvm_unreachable("Unknown loc info!");
2332    case CCValAssign::Full: break;
2333    case CCValAssign::SExt:
2334      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2335      break;
2336    case CCValAssign::ZExt:
2337      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2338      break;
2339    case CCValAssign::AExt:
2340      if (RegVT.is128BitVector()) {
2341        // Special case: passing MMX values in XMM registers.
2342        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2343        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2344        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2345      } else
2346        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2347      break;
2348    case CCValAssign::BCvt:
2349      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2350      break;
2351    case CCValAssign::Indirect: {
2352      // Store the argument.
2353      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2354      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2355      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2356                           MachinePointerInfo::getFixedStack(FI),
2357                           false, false, 0);
2358      Arg = SpillSlot;
2359      break;
2360    }
2361    }
2362
2363    if (VA.isRegLoc()) {
2364      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2365      if (isVarArg && IsWin64) {
2366        // The Win64 ABI requires an argument XMM reg to be copied to the
2367        // corresponding shadow reg if the callee is a varargs function.
2368        unsigned ShadowReg = 0;
2369        switch (VA.getLocReg()) {
2370        case X86::XMM0: ShadowReg = X86::RCX; break;
2371        case X86::XMM1: ShadowReg = X86::RDX; break;
2372        case X86::XMM2: ShadowReg = X86::R8; break;
2373        case X86::XMM3: ShadowReg = X86::R9; break;
2374        }
2375        if (ShadowReg)
2376          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2377      }
2378    } else if (!IsSibcall && (!isTailCall || isByVal)) {
2379      assert(VA.isMemLoc());
2380      if (StackPtr.getNode() == 0)
2381        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2382                                      getPointerTy());
2383      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2384                                             dl, DAG, VA, Flags));
2385    }
2386  }
2387
2388  if (!MemOpChains.empty())
2389    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2390                        &MemOpChains[0], MemOpChains.size());
2391
2392  if (Subtarget->isPICStyleGOT()) {
2393    // ELF / PIC requires the GOT pointer to be in the EBX register before
2394    // function calls via the PLT.
2395    if (!isTailCall) {
2396      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2397               DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
2398    } else {
2399      // If we are tail calling and generating PIC/GOT style code load the
2400      // address of the callee into ECX. The value in ecx is used as target of
2401      // the tail jump. This is done to circumvent the ebx/callee-saved problem
2402      // for tail calls on PIC/GOT architectures. Normally we would just put the
2403      // address of GOT into ebx and then call target@PLT. But for tail calls
2404      // ebx would be restored (since ebx is callee saved) before jumping to the
2405      // target@PLT.
2406
2407      // Note: The actual moving to ECX is done further down.
2408      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2409      if (G && !G->getGlobal()->hasHiddenVisibility() &&
2410          !G->getGlobal()->hasProtectedVisibility())
2411        Callee = LowerGlobalAddress(Callee, DAG);
2412      else if (isa<ExternalSymbolSDNode>(Callee))
2413        Callee = LowerExternalSymbol(Callee, DAG);
2414    }
2415  }
2416
2417  if (Is64Bit && isVarArg && !IsWin64) {
2418    // From AMD64 ABI document:
2419    // For calls that may call functions that use varargs or stdargs
2420    // (prototype-less calls or calls to functions containing ellipsis (...) in
2421    // the declaration) %al is used as a hidden argument to specify the number
2422    // of SSE registers used. The contents of %al do not need to match exactly
2423    // the number of registers, but must be an upper bound on the number of SSE
2424    // registers used and must be in the range 0 - 8 inclusive.
2425
2426    // Count the number of XMM registers allocated.
2427    static const uint16_t XMMArgRegs[] = {
2428      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2429      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2430    };
2431    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2432    assert((Subtarget->hasSSE1() || !NumXMMRegs)
2433           && "SSE registers cannot be used when SSE is disabled");
2434
2435    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
2436                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
2437  }
2438
2439  // For tail calls, lower the arguments to the 'real' stack slots.
2440  if (isTailCall) {
2441    // Force all the incoming stack arguments to be loaded from the stack
2442    // before any new outgoing arguments are stored to the stack, because the
2443    // outgoing stack slots may alias the incoming argument stack slots, and
2444    // the alias isn't otherwise explicit. This is slightly more conservative
2445    // than necessary, because it means that each store effectively depends
2446    // on every argument instead of just those arguments it would clobber.
2447    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2448
2449    SmallVector<SDValue, 8> MemOpChains2;
2450    SDValue FIN;
2451    int FI = 0;
2452    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2453      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2454        CCValAssign &VA = ArgLocs[i];
2455        if (VA.isRegLoc())
2456          continue;
2457        assert(VA.isMemLoc());
2458        SDValue Arg = OutVals[i];
2459        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2460        // Create frame index.
2461        int32_t Offset = VA.getLocMemOffset()+FPDiff;
2462        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2463        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2464        FIN = DAG.getFrameIndex(FI, getPointerTy());
2465
2466        if (Flags.isByVal()) {
2467          // Copy relative to framepointer.
2468          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2469          if (StackPtr.getNode() == 0)
2470            StackPtr = DAG.getCopyFromReg(Chain, dl,
2471                                          RegInfo->getStackRegister(),
2472                                          getPointerTy());
2473          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2474
2475          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2476                                                           ArgChain,
2477                                                           Flags, DAG, dl));
2478        } else {
2479          // Store relative to framepointer.
2480          MemOpChains2.push_back(
2481            DAG.getStore(ArgChain, dl, Arg, FIN,
2482                         MachinePointerInfo::getFixedStack(FI),
2483                         false, false, 0));
2484        }
2485      }
2486    }
2487
2488    if (!MemOpChains2.empty())
2489      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2490                          &MemOpChains2[0], MemOpChains2.size());
2491
2492    // Store the return address to the appropriate stack slot.
2493    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2494                                     getPointerTy(), RegInfo->getSlotSize(),
2495                                     FPDiff, dl);
2496  }
2497
2498  // Build a sequence of copy-to-reg nodes chained together with token chain
2499  // and flag operands which copy the outgoing args into registers.
2500  SDValue InFlag;
2501  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2502    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2503                             RegsToPass[i].second, InFlag);
2504    InFlag = Chain.getValue(1);
2505  }
2506
2507  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2508    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2509    // In the 64-bit large code model, we have to make all calls
2510    // through a register, since the call instruction's 32-bit
2511    // pc-relative offset may not be large enough to hold the whole
2512    // address.
2513  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2514    // If the callee is a GlobalAddress node (quite common, every direct call
2515    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2516    // it.
2517
2518    // We should use extra load for direct calls to dllimported functions in
2519    // non-JIT mode.
2520    const GlobalValue *GV = G->getGlobal();
2521    if (!GV->hasDLLImportLinkage()) {
2522      unsigned char OpFlags = 0;
2523      bool ExtraLoad = false;
2524      unsigned WrapperKind = ISD::DELETED_NODE;
2525
2526      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2527      // external symbols must go through the PLT in PIC mode.  If the symbol
2528      // has hidden or protected visibility, or if it is static or local, then
2529      // we don't need to use the PLT - we can directly call it.
2530      if (Subtarget->isTargetELF() &&
2531          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2532          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2533        OpFlags = X86II::MO_PLT;
2534      } else if (Subtarget->isPICStyleStubAny() &&
2535                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
2536                 (!Subtarget->getTargetTriple().isMacOSX() ||
2537                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2538        // PC-relative references to external symbols should go through $stub,
2539        // unless we're building with the Leopard linker or later, which
2540        // automatically synthesizes these stubs.
2541        OpFlags = X86II::MO_DARWIN_STUB;
2542      } else if (Subtarget->isPICStyleRIPRel() &&
2543                 isa<Function>(GV) &&
2544                 cast<Function>(GV)->getFnAttributes().
2545                   hasAttribute(Attribute::NonLazyBind)) {
2546        // If the function is marked as non-lazy, generate an indirect call
2547        // which loads from the GOT directly. This avoids runtime overhead
2548        // at the cost of eager binding (and one extra byte of encoding).
2549        OpFlags = X86II::MO_GOTPCREL;
2550        WrapperKind = X86ISD::WrapperRIP;
2551        ExtraLoad = true;
2552      }
2553
2554      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2555                                          G->getOffset(), OpFlags);
2556
2557      // Add a wrapper if needed.
2558      if (WrapperKind != ISD::DELETED_NODE)
2559        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
2560      // Add extra indirection if needed.
2561      if (ExtraLoad)
2562        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
2563                             MachinePointerInfo::getGOT(),
2564                             false, false, false, 0);
2565    }
2566  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2567    unsigned char OpFlags = 0;
2568
2569    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2570    // external symbols should go through the PLT.
2571    if (Subtarget->isTargetELF() &&
2572        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2573      OpFlags = X86II::MO_PLT;
2574    } else if (Subtarget->isPICStyleStubAny() &&
2575               (!Subtarget->getTargetTriple().isMacOSX() ||
2576                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2577      // PC-relative references to external symbols should go through $stub,
2578      // unless we're building with the Leopard linker or later, which
2579      // automatically synthesizes these stubs.
2580      OpFlags = X86II::MO_DARWIN_STUB;
2581    }
2582
2583    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2584                                         OpFlags);
2585  }
2586
2587  // Returns a chain & a flag for retval copy to use.
2588  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2589  SmallVector<SDValue, 8> Ops;
2590
2591  if (!IsSibcall && isTailCall) {
2592    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2593                           DAG.getIntPtrConstant(0, true), InFlag);
2594    InFlag = Chain.getValue(1);
2595  }
2596
2597  Ops.push_back(Chain);
2598  Ops.push_back(Callee);
2599
2600  if (isTailCall)
2601    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2602
2603  // Add argument registers to the end of the list so that they are known live
2604  // into the call.
2605  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2606    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2607                                  RegsToPass[i].second.getValueType()));
2608
2609  // Add a register mask operand representing the call-preserved registers.
2610  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
2611  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
2612  assert(Mask && "Missing call preserved mask for calling convention");
2613  Ops.push_back(DAG.getRegisterMask(Mask));
2614
2615  if (InFlag.getNode())
2616    Ops.push_back(InFlag);
2617
2618  if (isTailCall) {
2619    // We used to do:
2620    //// If this is the first return lowered for this function, add the regs
2621    //// to the liveout set for the function.
2622    // This isn't right, although it's probably harmless on x86; liveouts
2623    // should be computed from returns, not tail calls.  Consider a void
2624    // function making a tail call to a function returning int.
2625    return DAG.getNode(X86ISD::TC_RETURN, dl,
2626                       NodeTys, &Ops[0], Ops.size());
2627  }
2628
2629  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2630  InFlag = Chain.getValue(1);
2631
2632  // Create the CALLSEQ_END node.
2633  unsigned NumBytesForCalleeToPush;
2634  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2635                       getTargetMachine().Options.GuaranteedTailCallOpt))
2636    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2637  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2638           SR == StackStructReturn)
2639    // If this is a call to a struct-return function, the callee
2640    // pops the hidden struct pointer, so we have to push it back.
2641    // This is common for Darwin/X86, Linux & Mingw32 targets.
2642    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
2643    NumBytesForCalleeToPush = 4;
2644  else
2645    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2646
2647  // Returns a flag for retval copy to use.
2648  if (!IsSibcall) {
2649    Chain = DAG.getCALLSEQ_END(Chain,
2650                               DAG.getIntPtrConstant(NumBytes, true),
2651                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2652                                                     true),
2653                               InFlag);
2654    InFlag = Chain.getValue(1);
2655  }
2656
2657  // Handle result values, copying them out of physregs into vregs that we
2658  // return.
2659  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2660                         Ins, dl, DAG, InVals);
2661}
2662
2663//===----------------------------------------------------------------------===//
2664//                Fast Calling Convention (tail call) implementation
2665//===----------------------------------------------------------------------===//
2666
2667//  Like the stdcall convention, the callee cleans up the arguments, except that
2668//  ECX is reserved for storing the address of the tail-called function. Only
2669//  two registers are free for argument passing (inreg). Tail call optimization
2670//  is performed provided:
2671//                * tailcallopt is enabled
2672//                * caller/callee are fastcc
2673//  On the X86_64 architecture with GOT-style position-independent code, only
2674//  local (within module) calls are supported at the moment.
2675//  To keep the stack aligned according to the platform ABI, the function
2676//  GetAlignedArgumentStackSize ensures that the argument delta is always a
2677//  multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld
2678//  for example.) If the tail-called callee has more arguments than the caller,
2679//  the caller needs to make sure that there is room to move the RETADDR to.
2680//  This is achieved by reserving an area the size of the argument delta right
2681//  after the original RETADDR, but before the saved frame pointer or the
2682//  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2683//  stack layout:
2684//    arg1
2685//    arg2
2686//    RETADDR
2687//    [ new RETADDR
2688//      move area ]
2689//    (possible EBP)
2690//    ESI
2691//    EDI
2692//    local1 ..
2693
2694/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 bytes
2695/// for a 16-byte alignment requirement.
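/// For example, with a 16-byte stack alignment and a 4-byte slot size, a
/// StackSize of 20 becomes 28 (16*1 + 12) and 30 becomes 44 (16*2 + 12), so
/// that the return-address slot brings the total back to a 16-byte boundary.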
2696unsigned
2697X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2698                                               SelectionDAG& DAG) const {
2699  MachineFunction &MF = DAG.getMachineFunction();
2700  const TargetMachine &TM = MF.getTarget();
2701  const TargetFrameLowering &TFI = *TM.getFrameLowering();
2702  unsigned StackAlignment = TFI.getStackAlignment();
2703  uint64_t AlignMask = StackAlignment - 1;
2704  int64_t Offset = StackSize;
2705  unsigned SlotSize = RegInfo->getSlotSize();
2706  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2707    // The misalignment is at most (StackAlignment - SlotSize); just add the difference.
2708    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2709  } else {
2710    // Mask out the lower bits and add the stack alignment plus (StackAlignment - SlotSize).
2711    Offset = ((~AlignMask) & Offset) + StackAlignment +
2712      (StackAlignment-SlotSize);
2713  }
2714  return Offset;
2715}
2716
2717/// MatchingStackOffset - Return true if the given stack call argument is
2718/// already available in the same relative position in the caller's incoming
2719/// argument stack.
2720static
2721bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2722                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2723                         const X86InstrInfo *TII) {
2724  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2725  int FI = INT_MAX;
2726  if (Arg.getOpcode() == ISD::CopyFromReg) {
2727    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2728    if (!TargetRegisterInfo::isVirtualRegister(VR))
2729      return false;
2730    MachineInstr *Def = MRI->getVRegDef(VR);
2731    if (!Def)
2732      return false;
2733    if (!Flags.isByVal()) {
2734      if (!TII->isLoadFromStackSlot(Def, FI))
2735        return false;
2736    } else {
2737      unsigned Opcode = Def->getOpcode();
2738      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2739          Def->getOperand(1).isFI()) {
2740        FI = Def->getOperand(1).getIndex();
2741        Bytes = Flags.getByValSize();
2742      } else
2743        return false;
2744    }
2745  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2746    if (Flags.isByVal())
2747      // ByVal argument is passed in as a pointer but it's now being
2748      // dereferenced. e.g.
2749      // define @foo(%struct.X* %A) {
2750      //   tail call @bar(%struct.X* byval %A)
2751      // }
2752      return false;
2753    SDValue Ptr = Ld->getBasePtr();
2754    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2755    if (!FINode)
2756      return false;
2757    FI = FINode->getIndex();
2758  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
2759    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
2760    FI = FINode->getIndex();
2761    Bytes = Flags.getByValSize();
2762  } else
2763    return false;
2764
2765  assert(FI != INT_MAX);
2766  if (!MFI->isFixedObjectIndex(FI))
2767    return false;
2768  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2769}
2770
2771/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2772/// for tail call optimization. Targets which want to do tail call
2773/// optimization should implement this function.
2774bool
2775X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2776                                                     CallingConv::ID CalleeCC,
2777                                                     bool isVarArg,
2778                                                     bool isCalleeStructRet,
2779                                                     bool isCallerStructRet,
2780                                                     Type *RetTy,
2781                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
2782                                    const SmallVectorImpl<SDValue> &OutVals,
2783                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2784                                                     SelectionDAG& DAG) const {
2785  if (!IsTailCallConvention(CalleeCC) &&
2786      CalleeCC != CallingConv::C)
2787    return false;
2788
2789  // If -tailcallopt is specified, make fastcc functions tail-callable.
2790  const MachineFunction &MF = DAG.getMachineFunction();
2791  const Function *CallerF = DAG.getMachineFunction().getFunction();
2792
2793  // If the function return type is x86_fp80 and the callee return type is not,
2794  // then the FP_EXTEND of the call result is not a nop. It's not safe to
2795  // perform a tailcall optimization here.
2796  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
2797    return false;
2798
2799  CallingConv::ID CallerCC = CallerF->getCallingConv();
2800  bool CCMatch = CallerCC == CalleeCC;
2801
2802  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2803    if (IsTailCallConvention(CalleeCC) && CCMatch)
2804      return true;
2805    return false;
2806  }
2807
2808  // Look for obvious safe cases to perform tail call optimization that do not
2809  // require ABI changes. This is what gcc calls sibcall.
2810
2811  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2812  // emit a special epilogue.
2813  if (RegInfo->needsStackRealignment(MF))
2814    return false;
2815
2816  // Also avoid sibcall optimization if either caller or callee uses struct
2817  // return semantics.
2818  if (isCalleeStructRet || isCallerStructRet)
2819    return false;
2820
2821  // An stdcall caller is expected to clean up its arguments; the callee
2822  // isn't going to do that.
2823  if (!CCMatch && CallerCC==CallingConv::X86_StdCall)
2824    return false;
2825
2826  // Do not sibcall optimize vararg calls unless all arguments are passed via
2827  // registers.
2828  if (isVarArg && !Outs.empty()) {
2829
2830    // Optimizing for varargs on Win64 is unlikely to be safe without
2831    // additional testing.
2832    if (Subtarget->isTargetWin64())
2833      return false;
2834
2835    SmallVector<CCValAssign, 16> ArgLocs;
2836    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2837                   getTargetMachine(), ArgLocs, *DAG.getContext());
2838
2839    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2840    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
2841      if (!ArgLocs[i].isRegLoc())
2842        return false;
2843  }
2844
2845  // If the call result is in ST0 / ST1, it needs to be popped off the x87
2846  // stack.  Therefore, if it's not used by the call it is not safe to optimize
2847  // this into a sibcall.
2848  bool Unused = false;
2849  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
2850    if (!Ins[i].Used) {
2851      Unused = true;
2852      break;
2853    }
2854  }
2855  if (Unused) {
2856    SmallVector<CCValAssign, 16> RVLocs;
2857    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
2858                   getTargetMachine(), RVLocs, *DAG.getContext());
2859    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2860    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2861      CCValAssign &VA = RVLocs[i];
2862      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
2863        return false;
2864    }
2865  }
2866
2867  // If the calling conventions do not match, then we'd better make sure the
2868  // results are returned in the same way as what the caller expects.
2869  if (!CCMatch) {
2870    SmallVector<CCValAssign, 16> RVLocs1;
2871    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
2872                    getTargetMachine(), RVLocs1, *DAG.getContext());
2873    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
2874
2875    SmallVector<CCValAssign, 16> RVLocs2;
2876    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
2877                    getTargetMachine(), RVLocs2, *DAG.getContext());
2878    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
2879
2880    if (RVLocs1.size() != RVLocs2.size())
2881      return false;
2882    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2883      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2884        return false;
2885      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2886        return false;
2887      if (RVLocs1[i].isRegLoc()) {
2888        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2889          return false;
2890      } else {
2891        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2892          return false;
2893      }
2894    }
2895  }
2896
2897  // If the callee takes no arguments then go on to check the results of the
2898  // call.
2899  if (!Outs.empty()) {
2900    // Check if stack adjustment is needed. For now, do not do this if any
2901    // argument is passed on the stack.
2902    SmallVector<CCValAssign, 16> ArgLocs;
2903    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2904                   getTargetMachine(), ArgLocs, *DAG.getContext());
2905
2906    // Allocate shadow area for Win64
2907    if (Subtarget->isTargetWin64()) {
2908      CCInfo.AllocateStack(32, 8);
2909    }
2910
2911    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2912    if (CCInfo.getNextStackOffset()) {
2913      MachineFunction &MF = DAG.getMachineFunction();
2914      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2915        return false;
2916
2917      // Check if the arguments are already laid out in the right way as
2918      // the caller's fixed stack objects.
2919      MachineFrameInfo *MFI = MF.getFrameInfo();
2920      const MachineRegisterInfo *MRI = &MF.getRegInfo();
2921      const X86InstrInfo *TII =
2922        ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
2923      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2924        CCValAssign &VA = ArgLocs[i];
2925        SDValue Arg = OutVals[i];
2926        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2927        if (VA.getLocInfo() == CCValAssign::Indirect)
2928          return false;
2929        if (!VA.isRegLoc()) {
2930          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2931                                   MFI, MRI, TII))
2932            return false;
2933        }
2934      }
2935    }
2936
2937    // If the tailcall address may be in a register, then make sure it's
2938    // possible to register allocate for it. In 32-bit, the call address can
2939    // only target EAX, EDX, or ECX since the tail call must be scheduled after
2940    // callee-saved registers are restored. These happen to be the same
2941    // registers used to pass 'inreg' arguments so watch out for those.
2942    if (!Subtarget->is64Bit() &&
2943        !isa<GlobalAddressSDNode>(Callee) &&
2944        !isa<ExternalSymbolSDNode>(Callee)) {
2945      unsigned NumInRegs = 0;
2946      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2947        CCValAssign &VA = ArgLocs[i];
2948        if (!VA.isRegLoc())
2949          continue;
2950        unsigned Reg = VA.getLocReg();
2951        switch (Reg) {
2952        default: break;
2953        case X86::EAX: case X86::EDX: case X86::ECX:
2954          if (++NumInRegs == 3)
2955            return false;
2956          break;
2957        }
2958      }
2959    }
2960  }
2961
2962  return true;
2963}
2964
2965FastISel *
2966X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2967                                  const TargetLibraryInfo *libInfo) const {
2968  return X86::createFastISel(funcInfo, libInfo);
2969}
2970
2971//===----------------------------------------------------------------------===//
2972//                           Other Lowering Hooks
2973//===----------------------------------------------------------------------===//
2974
2975static bool MayFoldLoad(SDValue Op) {
2976  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
2977}
2978
2979static bool MayFoldIntoStore(SDValue Op) {
2980  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2981}
2982
2983static bool isTargetShuffle(unsigned Opcode) {
2984  switch(Opcode) {
2985  default: return false;
2986  case X86ISD::PSHUFD:
2987  case X86ISD::PSHUFHW:
2988  case X86ISD::PSHUFLW:
2989  case X86ISD::SHUFP:
2990  case X86ISD::PALIGN:
2991  case X86ISD::MOVLHPS:
2992  case X86ISD::MOVLHPD:
2993  case X86ISD::MOVHLPS:
2994  case X86ISD::MOVLPS:
2995  case X86ISD::MOVLPD:
2996  case X86ISD::MOVSHDUP:
2997  case X86ISD::MOVSLDUP:
2998  case X86ISD::MOVDDUP:
2999  case X86ISD::MOVSS:
3000  case X86ISD::MOVSD:
3001  case X86ISD::UNPCKL:
3002  case X86ISD::UNPCKH:
3003  case X86ISD::VPERMILP:
3004  case X86ISD::VPERM2X128:
3005  case X86ISD::VPERMI:
3006    return true;
3007  }
3008}
3009
3010static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
3011                                    SDValue V1, SelectionDAG &DAG) {
3012  switch(Opc) {
3013  default: llvm_unreachable("Unknown x86 shuffle node");
3014  case X86ISD::MOVSHDUP:
3015  case X86ISD::MOVSLDUP:
3016  case X86ISD::MOVDDUP:
3017    return DAG.getNode(Opc, dl, VT, V1);
3018  }
3019}
3020
3021static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
3022                                    SDValue V1, unsigned TargetMask,
3023                                    SelectionDAG &DAG) {
3024  switch(Opc) {
3025  default: llvm_unreachable("Unknown x86 shuffle node");
3026  case X86ISD::PSHUFD:
3027  case X86ISD::PSHUFHW:
3028  case X86ISD::PSHUFLW:
3029  case X86ISD::VPERMILP:
3030  case X86ISD::VPERMI:
3031    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3032  }
3033}
3034
3035static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
3036                                    SDValue V1, SDValue V2, unsigned TargetMask,
3037                                    SelectionDAG &DAG) {
3038  switch(Opc) {
3039  default: llvm_unreachable("Unknown x86 shuffle node");
3040  case X86ISD::PALIGN:
3041  case X86ISD::SHUFP:
3042  case X86ISD::VPERM2X128:
3043    return DAG.getNode(Opc, dl, VT, V1, V2,
3044                       DAG.getConstant(TargetMask, MVT::i8));
3045  }
3046}
3047
3048static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
3049                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
3050  switch(Opc) {
3051  default: llvm_unreachable("Unknown x86 shuffle node");
3052  case X86ISD::MOVLHPS:
3053  case X86ISD::MOVLHPD:
3054  case X86ISD::MOVHLPS:
3055  case X86ISD::MOVLPS:
3056  case X86ISD::MOVLPD:
3057  case X86ISD::MOVSS:
3058  case X86ISD::MOVSD:
3059  case X86ISD::UNPCKL:
3060  case X86ISD::UNPCKH:
3061    return DAG.getNode(Opc, dl, VT, V1, V2);
3062  }
3063}
3064
3065SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3066  MachineFunction &MF = DAG.getMachineFunction();
3067  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3068  int ReturnAddrIndex = FuncInfo->getRAIndex();
3069
3070  if (ReturnAddrIndex == 0) {
3071    // Set up a frame object for the return address.
3072    unsigned SlotSize = RegInfo->getSlotSize();
3073    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
3074                                                           false);
3075    FuncInfo->setRAIndex(ReturnAddrIndex);
3076  }
3077
3078  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3079}
3080
3081bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3082                                       bool hasSymbolicDisplacement) {
3083  // Offset should fit into 32 bit immediate field.
3084  if (!isInt<32>(Offset))
3085    return false;
3086
3087  // If we don't have a symbolic displacement - we don't have any extra
3088  // restrictions.
3089  if (!hasSymbolicDisplacement)
3090    return true;
3091
3092  // FIXME: Some tweaks might be needed for medium code model.
3093  if (M != CodeModel::Small && M != CodeModel::Kernel)
3094    return false;
3095
3096  // For the small code model we assume that the last object ends at least 16MB
3097  // before the 2GB (31-bit) boundary. We may also accept fairly large negative
3098  // constants, knowing that all objects are in the positive half of the address space.
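  // For example, a symbol placed just below the 2GB - 16MB limit plus an
  // offset smaller than 16MB still fits in the signed 32-bit displacement
  // field of the addressing mode.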
3099  if (M == CodeModel::Small && Offset < 16*1024*1024)
3100    return true;
3101
3102  // For the kernel code model we know that all objects reside in the negative
3103  // half of the 32-bit address space. We must not accept negative offsets, since
3104  // they could move the address out of range, but we may accept large positive ones.
3105  if (M == CodeModel::Kernel && Offset > 0)
3106    return true;
3107
3108  return false;
3109}
3110
3111/// isCalleePop - Determines whether the callee is required to pop its
3112/// own arguments. Callee pop is necessary to support tail calls.
3113bool X86::isCalleePop(CallingConv::ID CallingConv,
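/// For example, a 32-bit stdcall or fastcall callee pops its own arguments
/// with a 'ret imm16'; in 64-bit mode the caller always cleans up, so those
/// conventions return false there.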
3114                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3115  if (IsVarArg)
3116    return false;
3117
3118  switch (CallingConv) {
3119  default:
3120    return false;
3121  case CallingConv::X86_StdCall:
3122    return !is64Bit;
3123  case CallingConv::X86_FastCall:
3124    return !is64Bit;
3125  case CallingConv::X86_ThisCall:
3126    return !is64Bit;
3127  case CallingConv::Fast:
3128    return TailCallOpt;
3129  case CallingConv::GHC:
3130    return TailCallOpt;
3131  case CallingConv::HiPE:
3132    return TailCallOpt;
3133  }
3134}
3135
3136/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
3137/// X86-specific condition code, returning the condition code and the LHS/RHS
3138/// of the comparison to make.
3139static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3140                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3141  if (!isFP) {
3142    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3143      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3144        // X > -1   -> X == 0, jump !sign.
3145        RHS = DAG.getConstant(0, RHS.getValueType());
3146        return X86::COND_NS;
3147      }
3148      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3149        // X < 0   -> X == 0, jump on sign.
3150        return X86::COND_S;
3151      }
3152      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3153        // X < 1   -> X <= 0
3154        RHS = DAG.getConstant(0, RHS.getValueType());
3155        return X86::COND_LE;
3156      }
3157    }
3158
3159    switch (SetCCOpcode) {
3160    default: llvm_unreachable("Invalid integer condition!");
3161    case ISD::SETEQ:  return X86::COND_E;
3162    case ISD::SETGT:  return X86::COND_G;
3163    case ISD::SETGE:  return X86::COND_GE;
3164    case ISD::SETLT:  return X86::COND_L;
3165    case ISD::SETLE:  return X86::COND_LE;
3166    case ISD::SETNE:  return X86::COND_NE;
3167    case ISD::SETULT: return X86::COND_B;
3168    case ISD::SETUGT: return X86::COND_A;
3169    case ISD::SETULE: return X86::COND_BE;
3170    case ISD::SETUGE: return X86::COND_AE;
3171    }
3172  }
3173
3174  // First determine if it is required or is profitable to flip the operands.
3175
3176  // If LHS is a foldable load, but RHS is not, flip the condition.
3177  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3178      !ISD::isNON_EXTLoad(RHS.getNode())) {
3179    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3180    std::swap(LHS, RHS);
3181  }
3182
3183  switch (SetCCOpcode) {
3184  default: break;
3185  case ISD::SETOLT:
3186  case ISD::SETOLE:
3187  case ISD::SETUGT:
3188  case ISD::SETUGE:
3189    std::swap(LHS, RHS);
3190    break;
3191  }
3192
3193  // On a floating point condition, the flags are set as follows:
3194  // ZF  PF  CF   op
3195  //  0 | 0 | 0 | X > Y
3196  //  0 | 0 | 1 | X < Y
3197  //  1 | 0 | 0 | X == Y
3198  //  1 | 1 | 1 | unordered
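  // For example, SETOLT was turned into a greater-than on the swapped operands
  // above, so it maps to COND_A: with the flag encoding shown, "above" (ZF=0,
  // CF=0) is exactly the ordered X > Y case.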
3199  switch (SetCCOpcode) {
3200  default: llvm_unreachable("Condcode should be pre-legalized away");
3201  case ISD::SETUEQ:
3202  case ISD::SETEQ:   return X86::COND_E;
3203  case ISD::SETOLT:              // flipped
3204  case ISD::SETOGT:
3205  case ISD::SETGT:   return X86::COND_A;
3206  case ISD::SETOLE:              // flipped
3207  case ISD::SETOGE:
3208  case ISD::SETGE:   return X86::COND_AE;
3209  case ISD::SETUGT:              // flipped
3210  case ISD::SETULT:
3211  case ISD::SETLT:   return X86::COND_B;
3212  case ISD::SETUGE:              // flipped
3213  case ISD::SETULE:
3214  case ISD::SETLE:   return X86::COND_BE;
3215  case ISD::SETONE:
3216  case ISD::SETNE:   return X86::COND_NE;
3217  case ISD::SETUO:   return X86::COND_P;
3218  case ISD::SETO:    return X86::COND_NP;
3219  case ISD::SETOEQ:
3220  case ISD::SETUNE:  return X86::COND_INVALID;
3221  }
3222}
3223
3224/// hasFPCMov - Is there a floating-point cmov for the specific X86 condition
3225/// code? The current x86 ISA includes the following FP cmov instructions:
3226/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3227static bool hasFPCMov(unsigned X86CC) {
3228  switch (X86CC) {
3229  default:
3230    return false;
3231  case X86::COND_B:
3232  case X86::COND_BE:
3233  case X86::COND_E:
3234  case X86::COND_P:
3235  case X86::COND_A:
3236  case X86::COND_AE:
3237  case X86::COND_NE:
3238  case X86::COND_NP:
3239    return true;
3240  }
3241}
3242
3243/// isFPImmLegal - Returns true if the target can instruction select the
3244/// specified FP immediate natively. If false, the legalizer will
3245/// materialize the FP immediate as a load from a constant pool.
3246bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3247  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3248    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3249      return true;
3250  }
3251  return false;
3252}
3253
3254/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3255/// the specified range [Low, Hi).
3256static bool isUndefOrInRange(int Val, int Low, int Hi) {
3257  return (Val < 0) || (Val >= Low && Val < Hi);
3258}
3259
3260/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3261/// specified value.
3262static bool isUndefOrEqual(int Val, int CmpVal) {
3263  return (Val < 0 || Val == CmpVal);
3264}
3265
3266/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3267/// from position Pos and ending in Pos+Size, falls within the specified
3268/// from position Pos and ending at Pos+Size, falls within the specified
3269/// sequential range [Low, Low+Size), or is undef.
3270                                       unsigned Pos, unsigned Size, int Low) {
3271  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3272    if (!isUndefOrEqual(Mask[i], Low))
3273      return false;
3274  return true;
3275}
3276
3277/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3278/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
3279/// the second operand.
3280static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
3281  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
3282    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
3283  if (VT == MVT::v2f64 || VT == MVT::v2i64)
3284    return (Mask[0] < 2 && Mask[1] < 2);
3285  return false;
3286}
3287
3288/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3289/// is suitable for input to PSHUFHW.
3290static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
3291  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3292    return false;
3293
3294  // Lower quadword copied in order or undef.
3295  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3296    return false;
3297
3298  // Upper quadword shuffled.
3299  for (unsigned i = 4; i != 8; ++i)
3300    if (!isUndefOrInRange(Mask[i], 4, 8))
3301      return false;
3302
3303  if (VT == MVT::v16i16) {
3304    // Lower quadword copied in order or undef.
3305    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3306      return false;
3307
3308    // Upper quadword shuffled.
3309    for (unsigned i = 12; i != 16; ++i)
3310      if (!isUndefOrInRange(Mask[i], 12, 16))
3311        return false;
3312  }
3313
3314  return true;
3315}
3316
3317/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3318/// is suitable for input to PSHUFLW.
3319static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
3320  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3321    return false;
3322
3323  // Upper quadword copied in order.
3324  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3325    return false;
3326
3327  // Lower quadword shuffled.
3328  for (unsigned i = 0; i != 4; ++i)
3329    if (!isUndefOrInRange(Mask[i], 0, 4))
3330      return false;
3331
3332  if (VT == MVT::v16i16) {
3333    // Upper quadword copied in order.
3334    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
3335      return false;
3336
3337    // Lower quadword shuffled.
3338    for (unsigned i = 8; i != 12; ++i)
3339      if (!isUndefOrInRange(Mask[i], 8, 12))
3340        return false;
3341  }
3342
3343  return true;
3344}
3345
3346/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
3347/// is suitable for input to PALIGNR.
3348static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
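/// For example (a rough illustration), for v8i16 the mask <1, 2, 3, 4, 5, 6, 7, 8>
/// is accepted: it selects the seven trailing elements of the first source
/// followed by the first element of the second, i.e. an element-wise rotation
/// of the concatenated sources, which is what PALIGNR computes.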
3349                          const X86Subtarget *Subtarget) {
3350  if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
3351      (VT.getSizeInBits() == 256 && !Subtarget->hasInt256()))
3352    return false;
3353
3354  unsigned NumElts = VT.getVectorNumElements();
3355  unsigned NumLanes = VT.getSizeInBits()/128;
3356  unsigned NumLaneElts = NumElts/NumLanes;
3357
3358  // Do not handle 64-bit element shuffles with palignr.
3359  if (NumLaneElts == 2)
3360    return false;
3361
3362  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
3363    unsigned i;
3364    for (i = 0; i != NumLaneElts; ++i) {
3365      if (Mask[i+l] >= 0)
3366        break;
3367    }
3368
3369    // Lane is all undef, go to next lane
3370    if (i == NumLaneElts)
3371      continue;
3372
3373    int Start = Mask[i+l];
3374
3375    // Make sure it's in this lane in one of the sources.
3376    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
3377        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
3378      return false;
3379
3380    // If not lane 0, then we must match lane 0
3381    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
3382      return false;
3383
3384    // Correct second source to be contiguous with first source
3385    if (Start >= (int)NumElts)
3386      Start -= NumElts - NumLaneElts;
3387
3388    // Make sure we're shifting in the right direction.
3389    if (Start <= (int)(i+l))
3390      return false;
3391
3392    Start -= i;
3393
3394    // Check the rest of the elements to see if they are consecutive.
3395    for (++i; i != NumLaneElts; ++i) {
3396      int Idx = Mask[i+l];
3397
3398      // Make sure its in this lane
3399      // Make sure it's in this lane.
3400          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
3401        return false;
3402
3403      // If not lane 0, then we must match lane 0
3404      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
3405        return false;
3406
3407      if (Idx >= (int)NumElts)
3408        Idx -= NumElts - NumLaneElts;
3409
3410      if (!isUndefOrEqual(Idx, Start+i))
3411        return false;
3412
3413    }
3414  }
3415
3416  return true;
3417}
3418
3419/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3420/// the two vector operands have swapped position.
3421static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
3422                                     unsigned NumElems) {
3423  for (unsigned i = 0; i != NumElems; ++i) {
3424    int idx = Mask[i];
3425    if (idx < 0)
3426      continue;
3427    else if (idx < (int)NumElems)
3428      Mask[i] = idx + NumElems;
3429    else
3430      Mask[i] = idx - NumElems;
3431  }
3432}
3433
3434/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3435/// specifies a shuffle of elements that is suitable for input to 128/256-bit
3436/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
3437/// reverse of what x86 shuffles want.
3438static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
3439                        bool Commuted = false) {
3440  if (!HasFp256 && VT.getSizeInBits() == 256)
3441    return false;
3442
3443  unsigned NumElems = VT.getVectorNumElements();
3444  unsigned NumLanes = VT.getSizeInBits()/128;
3445  unsigned NumLaneElems = NumElems/NumLanes;
3446
3447  if (NumLaneElems != 2 && NumLaneElems != 4)
3448    return false;
3449
3450  // VSHUFPSY divides the resulting vector into 4 chunks.
3451  // The sources are also split into 4 chunks, and each destination
3452  // chunk must come from a different source chunk.
3453  //
3454  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
3455  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
3456  //
3457  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
3458  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
3459  //
3460  // VSHUFPDY divides the resulting vector into 4 chunks.
3461  // The sources are also splitted into 4 chunks, and each destination
3462  // The sources are also split into 4 chunks, and each destination
3463  //
3464  //  SRC1 =>      X3       X2       X1       X0
3465  //  SRC2 =>      Y3       Y2       Y1       Y0
3466  //
3467  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
3468  //
3469  unsigned HalfLaneElems = NumLaneElems/2;
3470  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
3471    for (unsigned i = 0; i != NumLaneElems; ++i) {
3472      int Idx = Mask[i+l];
3473      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
3474      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
3475        return false;
3476      // For VSHUFPSY, the mask of the second half must be the same as the
3477      // first but with the appropriate offsets. This works in the same way as
3478      // VPERMILPS works with masks.
3479      if (NumElems != 8 || l == 0 || Mask[i] < 0)
3480        continue;
3481      if (!isUndefOrEqual(Idx, Mask[i]+l))
3482        return false;
3483    }
3484  }
3485
3486  return true;
3487}
3488
3489/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3490/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3491static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
3492  if (!VT.is128BitVector())
3493    return false;
3494
3495  unsigned NumElems = VT.getVectorNumElements();
3496
3497  if (NumElems != 4)
3498    return false;
3499
3500  // Expect element 0 == 6, element 1 == 7, element 2 == 2, element 3 == 3
3501  return isUndefOrEqual(Mask[0], 6) &&
3502         isUndefOrEqual(Mask[1], 7) &&
3503         isUndefOrEqual(Mask[2], 2) &&
3504         isUndefOrEqual(Mask[3], 3);
3505}
3506
3507/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3508/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3509/// <2, 3, 2, 3>
3510static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
3511  if (!VT.is128BitVector())
3512    return false;
3513
3514  unsigned NumElems = VT.getVectorNumElements();
3515
3516  if (NumElems != 4)
3517    return false;
3518
3519  return isUndefOrEqual(Mask[0], 2) &&
3520         isUndefOrEqual(Mask[1], 3) &&
3521         isUndefOrEqual(Mask[2], 2) &&
3522         isUndefOrEqual(Mask[3], 3);
3523}
3524
3525/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3526/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
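/// For example, for v4f32 this matches the mask <4, 5, 2, 3>: the low half of
/// the result is taken from the second operand and the high half is kept from
/// the first.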
3527static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
3528  if (!VT.is128BitVector())
3529    return false;
3530
3531  unsigned NumElems = VT.getVectorNumElements();
3532
3533  if (NumElems != 2 && NumElems != 4)
3534    return false;
3535
3536  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3537    if (!isUndefOrEqual(Mask[i], i + NumElems))
3538      return false;
3539
3540  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
3541    if (!isUndefOrEqual(Mask[i], i))
3542      return false;
3543
3544  return true;
3545}
3546
3547/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3548/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
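/// For example, for v4f32 this matches the mask <0, 1, 4, 5>: the low half of
/// the result comes from the first operand and the high half comes from the
/// low half of the second operand.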
3549static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
3550  if (!VT.is128BitVector())
3551    return false;
3552
3553  unsigned NumElems = VT.getVectorNumElements();
3554
3555  if (NumElems != 2 && NumElems != 4)
3556    return false;
3557
3558  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3559    if (!isUndefOrEqual(Mask[i], i))
3560      return false;
3561
3562  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3563    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
3564      return false;
3565
3566  return true;
3567}
3568
3569//
3570// Some special combinations that can be optimized.
3571//
3572static
3573SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
3574                               SelectionDAG &DAG) {
3575  EVT VT = SVOp->getValueType(0);
3576  DebugLoc dl = SVOp->getDebugLoc();
3577
3578  if (VT != MVT::v8i32 && VT != MVT::v8f32)
3579    return SDValue();
3580
3581  ArrayRef<int> Mask = SVOp->getMask();
3582
3583  // These are the special masks that may be optimized.
3584  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
3585  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
3586  bool MatchEvenMask = true;
3587  bool MatchOddMask  = true;
3588  for (int i=0; i<8; ++i) {
3589    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
3590      MatchEvenMask = false;
3591    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
3592      MatchOddMask = false;
3593  }
3594
3595  if (!MatchEvenMask && !MatchOddMask)
3596    return SDValue();
3597
3598  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
3599
3600  SDValue Op0 = SVOp->getOperand(0);
3601  SDValue Op1 = SVOp->getOperand(1);
3602
3603  if (MatchEvenMask) {
3604    // Shift the second operand right to 32 bits.
3605    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
3606    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
3607  } else {
3608    // Shift the first operand left to 32 bits.
3609    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
3610    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
3611  }
3612  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
3613  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
3614}
3615
3616/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
3617/// specifies a shuffle of elements that is suitable for input to UNPCKL.
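/// For example, for v4i32 the canonical UNPCKL mask is <0, 4, 1, 5>, which
/// interleaves the low halves of the two sources; for 256-bit types the same
/// pattern repeats independently in each 128-bit lane.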
3618static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
3619                         bool HasInt256, bool V2IsSplat = false) {
3620  unsigned NumElts = VT.getVectorNumElements();
3621
3622  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3623         "Unsupported vector type for unpckl");
3624
3625  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3626      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3627    return false;
3628
3629  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3630  // independently on 128-bit lanes.
3631  unsigned NumLanes = VT.getSizeInBits()/128;
3632  unsigned NumLaneElts = NumElts/NumLanes;
3633
3634  for (unsigned l = 0; l != NumLanes; ++l) {
3635    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3636         i != (l+1)*NumLaneElts;
3637         i += 2, ++j) {
3638      int BitI  = Mask[i];
3639      int BitI1 = Mask[i+1];
3640      if (!isUndefOrEqual(BitI, j))
3641        return false;
3642      if (V2IsSplat) {
3643        if (!isUndefOrEqual(BitI1, NumElts))
3644          return false;
3645      } else {
3646        if (!isUndefOrEqual(BitI1, j + NumElts))
3647          return false;
3648      }
3649    }
3650  }
3651
3652  return true;
3653}
3654
3655/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
3656/// specifies a shuffle of elements that is suitable for input to UNPCKH.
3657static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
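/// For example, for v4i32 the canonical UNPCKH mask is <2, 6, 3, 7>, which
/// interleaves the high halves of the two sources, again per 128-bit lane.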
3658                         bool HasInt256, bool V2IsSplat = false) {
3659  unsigned NumElts = VT.getVectorNumElements();
3660
3661  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3662         "Unsupported vector type for unpckh");
3663
3664  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3665      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3666    return false;
3667
3668  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3669  // independently on 128-bit lanes.
3670  unsigned NumLanes = VT.getSizeInBits()/128;
3671  unsigned NumLaneElts = NumElts/NumLanes;
3672
3673  for (unsigned l = 0; l != NumLanes; ++l) {
3674    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3675         i != (l+1)*NumLaneElts; i += 2, ++j) {
3676      int BitI  = Mask[i];
3677      int BitI1 = Mask[i+1];
3678      if (!isUndefOrEqual(BitI, j))
3679        return false;
3680      if (V2IsSplat) {
3681        if (isUndefOrEqual(BitI1, NumElts))
3682          return false;
3683      } else {
3684        if (!isUndefOrEqual(BitI1, j+NumElts))
3685          return false;
3686      }
3687    }
3688  }
3689  return true;
3690}
3691
3692/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
3693/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
3694/// <0, 0, 1, 1>
3695static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
3696                                  bool HasInt256) {
3697  unsigned NumElts = VT.getVectorNumElements();
3698
3699  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3700         "Unsupported vector type for unpckl");
3701
3702  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3703      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3704    return false;
3705
3706  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern.
3707  // FIXME: Need a better way to get rid of this, there's no latency difference
3708  // between UNPCKLPD and MOVDDUP; the latter should always be checked first and
3709  // the former later. We should also remove the "_undef" special mask.
3710  if (NumElts == 4 && VT.getSizeInBits() == 256)
3711    return false;
3712
3713  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3714  // independently on 128-bit lanes.
3715  unsigned NumLanes = VT.getSizeInBits()/128;
3716  unsigned NumLaneElts = NumElts/NumLanes;
3717
3718  for (unsigned l = 0; l != NumLanes; ++l) {
3719    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3720         i != (l+1)*NumLaneElts;
3721         i += 2, ++j) {
3722      int BitI  = Mask[i];
3723      int BitI1 = Mask[i+1];
3724
3725      if (!isUndefOrEqual(BitI, j))
3726        return false;
3727      if (!isUndefOrEqual(BitI1, j))
3728        return false;
3729    }
3730  }
3731
3732  return true;
3733}
3734
3735/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
3736/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
3737/// <2, 2, 3, 3>
3738static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
3739  unsigned NumElts = VT.getVectorNumElements();
3740
3741  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3742         "Unsupported vector type for unpckh");
3743
3744  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3745      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3746    return false;
3747
3748  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3749  // independently on 128-bit lanes.
3750  unsigned NumLanes = VT.getSizeInBits()/128;
3751  unsigned NumLaneElts = NumElts/NumLanes;
3752
3753  for (unsigned l = 0; l != NumLanes; ++l) {
3754    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3755         i != (l+1)*NumLaneElts; i += 2, ++j) {
3756      int BitI  = Mask[i];
3757      int BitI1 = Mask[i+1];
3758      if (!isUndefOrEqual(BitI, j))
3759        return false;
3760      if (!isUndefOrEqual(BitI1, j))
3761        return false;
3762    }
3763  }
3764  return true;
3765}
3766
3767/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3768/// specifies a shuffle of elements that is suitable for input to MOVSS,
3769/// MOVSD, and MOVD, i.e. setting the lowest element.
3770static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
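/// For example, for v4i32 the mask <4, 1, 2, 3> matches: the lowest element is
/// taken from the second operand and the rest are kept from the first.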
3771  if (VT.getVectorElementType().getSizeInBits() < 32)
3772    return false;
3773  if (!VT.is128BitVector())
3774    return false;
3775
3776  unsigned NumElts = VT.getVectorNumElements();
3777
3778  if (!isUndefOrEqual(Mask[0], NumElts))
3779    return false;
3780
3781  for (unsigned i = 1; i != NumElts; ++i)
3782    if (!isUndefOrEqual(Mask[i], i))
3783      return false;
3784
3785  return true;
3786}
3787
3788/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
3789/// as permutations between 128-bit chunks or halves. As an example, in the
3790/// shuffle below:
3791///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
3792/// the first half comes from the second half of V1 and the second half comes
3793/// from the second half of V2.
3794static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
3795  if (!HasFp256 || !VT.is256BitVector())
3796    return false;
3797
3798  // The shuffle result is divided into half A and half B. In total the two
3799  // sources have 4 halves, namely: C, D, E, F. The final values of A and
3800  // B must come from C, D, E or F.
3801  unsigned HalfSize = VT.getVectorNumElements()/2;
3802  bool MatchA = false, MatchB = false;
3803
3804  // Check if A comes from one of C, D, E, F.
3805  for (unsigned Half = 0; Half != 4; ++Half) {
3806    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
3807      MatchA = true;
3808      break;
3809    }
3810  }
3811
3812  // Check if B comes from one of C, D, E, F.
3813  for (unsigned Half = 0; Half != 4; ++Half) {
3814    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
3815      MatchB = true;
3816      break;
3817    }
3818  }
3819
3820  return MatchA && MatchB;
3821}
3822
3823/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
3824/// the specified VECTOR_SHUFFLE mask with the VPERM2F128/VPERM2I128 instructions.
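/// For example, for the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> above, FstHalf
/// is 4/4 = 1 and SndHalf is 12/4 = 3, giving the immediate 0x31 (take the high
/// half of V1 and the high half of V2).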
3825static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
3826  EVT VT = SVOp->getValueType(0);
3827
3828  unsigned HalfSize = VT.getVectorNumElements()/2;
3829
3830  unsigned FstHalf = 0, SndHalf = 0;
3831  for (unsigned i = 0; i < HalfSize; ++i) {
3832    if (SVOp->getMaskElt(i) > 0) {
3833      FstHalf = SVOp->getMaskElt(i)/HalfSize;
3834      break;
3835    }
3836  }
3837  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
3838    if (SVOp->getMaskElt(i) > 0) {
3839      SndHalf = SVOp->getMaskElt(i)/HalfSize;
3840      break;
3841    }
3842  }
3843
3844  return (FstHalf | (SndHalf << 4));
3845}
3846
3847/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
3848/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
3849/// Note that VPERMIL mask matching is different depending whether theunderlying
3850/// Note that VPERMIL mask matching differs by element size. For VPERMILPS the
3851/// high half of the mask should point to the same elements as the low half, but
3852/// within the upper half of the source. For VPERMILPD the two lanes can be
3853/// shuffled independently, as long as no lane is crossed. Also handles PSHUFDY.
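/// For example, for v8f32 the mask <1, 0, 3, 2, 5, 4, 7, 6> is a valid
/// VPERMILPS mask: both 128-bit lanes use the same in-lane permutation and no
/// element crosses a lane boundary.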
3854static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
3855  if (!HasFp256)
3856    return false;
3857
3858  unsigned NumElts = VT.getVectorNumElements();
3859  // Only match 256-bit with 32/64-bit types
3860  if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
3861    return false;
3862
3863  unsigned NumLanes = VT.getSizeInBits()/128;
3864  unsigned LaneSize = NumElts/NumLanes;
3865  for (unsigned l = 0; l != NumElts; l += LaneSize) {
3866    for (unsigned i = 0; i != LaneSize; ++i) {
3867      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
3868        return false;
3869      if (NumElts != 8 || l == 0)
3870        continue;
3871      // VPERMILPS handling
3872      if (Mask[i] < 0)
3873        continue;
3874      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
3875        return false;
3876    }
3877  }
3878
3879  return true;
3880}
3881
3882/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
3883/// x86 movss wants: the lowest element must be the lowest element of vector 2,
3884/// and the other elements must come from vector 1 in order.
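/// For example, with v4i32 the mask <0, 5, 6, 7> is accepted; it is the commuted
/// form of the MOVL pattern <4, 1, 2, 3>.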
3885static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
3886                               bool V2IsSplat = false, bool V2IsUndef = false) {
3887  if (!VT.is128BitVector())
3888    return false;
3889
3890  unsigned NumOps = VT.getVectorNumElements();
3891  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3892    return false;
3893
3894  if (!isUndefOrEqual(Mask[0], 0))
3895    return false;
3896
3897  for (unsigned i = 1; i != NumOps; ++i)
3898    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3899          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3900          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3901      return false;
3902
3903  return true;
3904}
3905
3906/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3907/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3908/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
3909static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
3910                           const X86Subtarget *Subtarget) {
3911  if (!Subtarget->hasSSE3())
3912    return false;
3913
3914  unsigned NumElems = VT.getVectorNumElements();
3915
3916  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3917      (VT.getSizeInBits() == 256 && NumElems != 8))
3918    return false;
3919
3920  // "i+1" is the value the indexed mask element must have
3921  for (unsigned i = 0; i != NumElems; i += 2)
3922    if (!isUndefOrEqual(Mask[i], i+1) ||
3923        !isUndefOrEqual(Mask[i+1], i+1))
3924      return false;
3925
3926  return true;
3927}
3928
3929/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3930/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3931/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
3932static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
3933                           const X86Subtarget *Subtarget) {
3934  if (!Subtarget->hasSSE3())
3935    return false;
3936
3937  unsigned NumElems = VT.getVectorNumElements();
3938
3939  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3940      (VT.getSizeInBits() == 256 && NumElems != 8))
3941    return false;
3942
3943  // "i" is the value the indexed mask element must have
3944  for (unsigned i = 0; i != NumElems; i += 2)
3945    if (!isUndefOrEqual(Mask[i], i) ||
3946        !isUndefOrEqual(Mask[i+1], i))
3947      return false;
3948
3949  return true;
3950}
3951
3952/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
3953/// specifies a shuffle of elements that is suitable for input to 256-bit
3954/// version of MOVDDUP.
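/// The only mask accepted here (modulo undefs) is <0, 0, 2, 2>: the low half
/// broadcasts element 0 and the high half broadcasts element 2.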
3955static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
3956  if (!HasFp256 || !VT.is256BitVector())
3957    return false;
3958
3959  unsigned NumElts = VT.getVectorNumElements();
3960  if (NumElts != 4)
3961    return false;
3962
3963  for (unsigned i = 0; i != NumElts/2; ++i)
3964    if (!isUndefOrEqual(Mask[i], 0))
3965      return false;
3966  for (unsigned i = NumElts/2; i != NumElts; ++i)
3967    if (!isUndefOrEqual(Mask[i], NumElts/2))
3968      return false;
3969  return true;
3970}
3971
3972/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3973/// specifies a shuffle of elements that is suitable for input to 128-bit
3974/// version of MOVDDUP.
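/// For v2f64 the accepted mask (modulo undefs) is <0, 0>: both result elements
/// are copies of the low element of the source.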
3975static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
3976  if (!VT.is128BitVector())
3977    return false;
3978
3979  unsigned e = VT.getVectorNumElements() / 2;
3980  for (unsigned i = 0; i != e; ++i)
3981    if (!isUndefOrEqual(Mask[i], i))
3982      return false;
3983  for (unsigned i = 0; i != e; ++i)
3984    if (!isUndefOrEqual(Mask[e+i], i))
3985      return false;
3986  return true;
3987}
3988
3989/// isVEXTRACTF128Index - Return true if the specified
3990/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
3991/// suitable for input to VEXTRACTF128.
3992bool X86::isVEXTRACTF128Index(SDNode *N) {
3993  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
3994    return false;
3995
3996  // The index should be aligned on a 128-bit boundary.
3997  uint64_t Index =
3998    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
3999
4000  unsigned VL = N->getValueType(0).getVectorNumElements();
4001  unsigned VBits = N->getValueType(0).getSizeInBits();
4002  unsigned ElSize = VBits / VL;
4003  bool Result = (Index * ElSize) % 128 == 0;
4004
4005  return Result;
4006}
4007
4008/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
4009/// operand specifies a subvector insert that is suitable for input to
4010/// VINSERTF128.
4011bool X86::isVINSERTF128Index(SDNode *N) {
4012  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4013    return false;
4014
4015  // The index should be aligned on a 128-bit boundary.
4016  uint64_t Index =
4017    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4018
4019  unsigned VL = N->getValueType(0).getVectorNumElements();
4020  unsigned VBits = N->getValueType(0).getSizeInBits();
4021  unsigned ElSize = VBits / VL;
4022  bool Result = (Index * ElSize) % 128 == 0;
4023
4024  return Result;
4025}
4026
4027/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4028/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4029/// Handles 128-bit and 256-bit.
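/// For example, for the v4f32 mask <2, 1, 6, 7> the immediate is encoded two bits
/// per element, low to high: 2 | (1 << 2) | (2 << 4) | (3 << 6) = 0xE6, where the
/// second-source indices 6 and 7 are reduced modulo the 4 elements per lane.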
4030static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4031  EVT VT = N->getValueType(0);
4032
4033  assert((VT.is128BitVector() || VT.is256BitVector()) &&
4034         "Unsupported vector type for PSHUF/SHUFP");
4035
4036  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4037  // independently on 128-bit lanes.
4038  unsigned NumElts = VT.getVectorNumElements();
4039  unsigned NumLanes = VT.getSizeInBits()/128;
4040  unsigned NumLaneElts = NumElts/NumLanes;
4041
4042  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
4043         "Only supports 2 or 4 elements per lane");
4044
4045  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
4046  unsigned Mask = 0;
4047  for (unsigned i = 0; i != NumElts; ++i) {
4048    int Elt = N->getMaskElt(i);
4049    if (Elt < 0) continue;
4050    Elt &= NumLaneElts - 1;
4051    unsigned ShAmt = (i << Shift) % 8;
4052    Mask |= Elt << ShAmt;
4053  }
4054
4055  return Mask;
4056}
4057
4058/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4059/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
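/// For example, for the v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4> only elements 4..7
/// are encoded: 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.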
4060static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4061  EVT VT = N->getValueType(0);
4062
4063  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4064         "Unsupported vector type for PSHUFHW");
4065
4066  unsigned NumElts = VT.getVectorNumElements();
4067
4068  unsigned Mask = 0;
4069  for (unsigned l = 0; l != NumElts; l += 8) {
4070    // 8 nodes per lane, but we only care about the last 4.
4071    for (unsigned i = 0; i < 4; ++i) {
4072      int Elt = N->getMaskElt(l+i+4);
4073      if (Elt < 0) continue;
4074      Elt &= 0x3; // only 2-bits.
4075      Mask |= Elt << (i * 2);
4076    }
4077  }
4078
4079  return Mask;
4080}
4081
4082/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4083/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
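/// For example, for the v8i16 mask <3, 2, 1, 0, 4, 5, 6, 7> only elements 0..3
/// are encoded: 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.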
4084static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4085  EVT VT = N->getValueType(0);
4086
4087  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4088         "Unsupported vector type for PSHUFLW");
4089
4090  unsigned NumElts = VT.getVectorNumElements();
4091
4092  unsigned Mask = 0;
4093  for (unsigned l = 0; l != NumElts; l += 8) {
4094    // 8 nodes per lane, but we only care about the first 4.
4095    for (unsigned i = 0; i < 4; ++i) {
4096      int Elt = N->getMaskElt(l+i);
4097      if (Elt < 0) continue;
4098      Elt &= 0x3; // only 2-bits
4099      Mask |= Elt << (i * 2);
4100    }
4101  }
4102
4103  return Mask;
4104}
4105
4106/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
4107/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
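/// For example, for a v16i8 shuffle whose first defined mask element is 5, the
/// returned immediate is (5 - 0) * 1 byte = 5.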
4108static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4109  EVT VT = SVOp->getValueType(0);
4110  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
4111
4112  unsigned NumElts = VT.getVectorNumElements();
4113  unsigned NumLanes = VT.getSizeInBits()/128;
4114  unsigned NumLaneElts = NumElts/NumLanes;
4115
4116  int Val = 0;
4117  unsigned i;
4118  for (i = 0; i != NumElts; ++i) {
4119    Val = SVOp->getMaskElt(i);
4120    if (Val >= 0)
4121      break;
4122  }
4123  if (Val >= (int)NumElts)
4124    Val -= NumElts - NumLaneElts;
4125
4126  assert(Val - i > 0 && "PALIGNR imm should be positive");
4127  return (Val - i) * EltSize;
4128}
4129
4130/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
4131/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4132/// instructions.
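/// For example, extracting the subvector at element index 4 of a v8i32 source
/// gives 4 / (128 / 32) = 1, i.e. the upper 128-bit half.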
4133unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
4134  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4135    llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
4136
4137  uint64_t Index =
4138    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4139
4140  EVT VecVT = N->getOperand(0).getValueType();
4141  EVT ElVT = VecVT.getVectorElementType();
4142
4143  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
4144  return Index / NumElemsPerChunk;
4145}
4146
4147/// getInsertVINSERTF128Immediate - Return the appropriate immediate
4148/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
4149/// instructions.
4150unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
4151  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4152    llvm_unreachable("Illegal insert subvector for VINSERTF128");
4153
4154  uint64_t Index =
4155    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4156
4157  EVT VecVT = N->getValueType(0);
4158  EVT ElVT = VecVT.getVectorElementType();
4159
4160  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
4161  return Index / NumElemsPerChunk;
4162}
4163
4164/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
4165/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
4166/// Handles 256-bit.
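/// For example, for the v4i64 mask <3, 2, 1, 0> the immediate is
/// 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.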
4167static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
4168  EVT VT = N->getValueType(0);
4169
4170  unsigned NumElts = VT.getVectorNumElements();
4171
4172  assert((VT.is256BitVector() && NumElts == 4) &&
4173         "Unsupported vector type for VPERMQ/VPERMPD");
4174
4175  unsigned Mask = 0;
4176  for (unsigned i = 0; i != NumElts; ++i) {
4177    int Elt = N->getMaskElt(i);
4178    if (Elt < 0)
4179      continue;
4180    Mask |= Elt << (i*2);
4181  }
4182
4183  return Mask;
4184}
4185/// isZeroNode - Returns true if Elt is a constant zero or a floating point
4186/// constant +0.0.
4187bool X86::isZeroNode(SDValue Elt) {
4188  return ((isa<ConstantSDNode>(Elt) &&
4189           cast<ConstantSDNode>(Elt)->isNullValue()) ||
4190          (isa<ConstantFPSDNode>(Elt) &&
4191           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
4192}
4193
4194/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
4195/// their permute mask.
4196static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
4197                                    SelectionDAG &DAG) {
4198  EVT VT = SVOp->getValueType(0);
4199  unsigned NumElems = VT.getVectorNumElements();
4200  SmallVector<int, 8> MaskVec;
4201
4202  for (unsigned i = 0; i != NumElems; ++i) {
4203    int Idx = SVOp->getMaskElt(i);
4204    if (Idx >= 0) {
4205      if (Idx < (int)NumElems)
4206        Idx += NumElems;
4207      else
4208        Idx -= NumElems;
4209    }
4210    MaskVec.push_back(Idx);
4211  }
4212  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
4213                              SVOp->getOperand(0), &MaskVec[0]);
4214}
4215
4216/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
4217/// match movhlps. The lower half elements should come from the upper half of
4218/// V1 (and in order), and the upper half elements should come from the upper
4219/// half of V2 (and in order).
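/// The canonical accepted mask is therefore <2, 3, 6, 7> (modulo undefs).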
4220static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
4221  if (!VT.is128BitVector())
4222    return false;
4223  if (VT.getVectorNumElements() != 4)
4224    return false;
4225  for (unsigned i = 0, e = 2; i != e; ++i)
4226    if (!isUndefOrEqual(Mask[i], i+2))
4227      return false;
4228  for (unsigned i = 2; i != 4; ++i)
4229    if (!isUndefOrEqual(Mask[i], i+4))
4230      return false;
4231  return true;
4232}
4233
4234/// isScalarLoadToVector - Returns true if the node is a scalar load that
4235/// is promoted to a vector. It also returns the LoadSDNode by reference if
4236/// required.
4237static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
4238  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
4239    return false;
4240  N = N->getOperand(0).getNode();
4241  if (!ISD::isNON_EXTLoad(N))
4242    return false;
4243  if (LD)
4244    *LD = cast<LoadSDNode>(N);
4245  return true;
4246}
4247
4248// Test whether the given value is a vector value which will be legalized
4249// into a load.
4250static bool WillBeConstantPoolLoad(SDNode *N) {
4251  if (N->getOpcode() != ISD::BUILD_VECTOR)
4252    return false;
4253
4254  // Check for any non-constant elements.
4255  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
4256    switch (N->getOperand(i).getNode()->getOpcode()) {
4257    case ISD::UNDEF:
4258    case ISD::ConstantFP:
4259    case ISD::Constant:
4260      break;
4261    default:
4262      return false;
4263    }
4264
4265  // Vectors of all-zeros and all-ones are materialized with special
4266  // instructions rather than being loaded.
4267  return !ISD::isBuildVectorAllZeros(N) &&
4268         !ISD::isBuildVectorAllOnes(N);
4269}
4270
4271/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
4272/// match movlp{s|d}. The lower half elements should come from lower half of
4273/// V1 (and in order), and the upper half elements should come from the upper
4274/// half of V2 (and in order). And since V1 will become the source of the
4275/// MOVLP, it must be either a vector load or a scalar load to vector.
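/// For four elements the accepted mask is <0, 1, 6, 7> (modulo undefs).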
4276static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
4277                               ArrayRef<int> Mask, EVT VT) {
4278  if (!VT.is128BitVector())
4279    return false;
4280
4281  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
4282    return false;
4283  // If V2 is a vector load, don't do this transformation. We will try to use
4284  // a load-folding shufps op instead.
4285  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
4286    return false;
4287
4288  unsigned NumElems = VT.getVectorNumElements();
4289
4290  if (NumElems != 2 && NumElems != 4)
4291    return false;
4292  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4293    if (!isUndefOrEqual(Mask[i], i))
4294      return false;
4295  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4296    if (!isUndefOrEqual(Mask[i], i+NumElems))
4297      return false;
4298  return true;
4299}
4300
4301/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
4302/// all the same.
4303static bool isSplatVector(SDNode *N) {
4304  if (N->getOpcode() != ISD::BUILD_VECTOR)
4305    return false;
4306
4307  SDValue SplatValue = N->getOperand(0);
4308  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
4309    if (N->getOperand(i) != SplatValue)
4310      return false;
4311  return true;
4312}
4313
4314/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
4315/// to a zero vector.
4316/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
4317static bool isZeroShuffle(ShuffleVectorSDNode *N) {
4318  SDValue V1 = N->getOperand(0);
4319  SDValue V2 = N->getOperand(1);
4320  unsigned NumElems = N->getValueType(0).getVectorNumElements();
4321  for (unsigned i = 0; i != NumElems; ++i) {
4322    int Idx = N->getMaskElt(i);
4323    if (Idx >= (int)NumElems) {
4324      unsigned Opc = V2.getOpcode();
4325      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
4326        continue;
4327      if (Opc != ISD::BUILD_VECTOR ||
4328          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
4329        return false;
4330    } else if (Idx >= 0) {
4331      unsigned Opc = V1.getOpcode();
4332      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
4333        continue;
4334      if (Opc != ISD::BUILD_VECTOR ||
4335          !X86::isZeroNode(V1.getOperand(Idx)))
4336        return false;
4337    }
4338  }
4339  return true;
4340}
4341
4342/// getZeroVector - Returns a vector of specified type with all zero elements.
4343///
4344static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
4345                             SelectionDAG &DAG, DebugLoc dl) {
4346  assert(VT.isVector() && "Expected a vector type");
4347  unsigned Size = VT.getSizeInBits();
4348
4349  // Always build SSE zero vectors as <4 x i32> bitcasted
4350  // to their dest type. This ensures they get CSE'd.
4351  SDValue Vec;
4352  if (Size == 128) {  // SSE
4353    if (Subtarget->hasSSE2()) {  // SSE2
4354      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4355      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4356    } else { // SSE1
4357      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4358      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
4359    }
4360  } else if (Size == 256) { // AVX
4361    if (Subtarget->hasInt256()) { // AVX2
4362      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4363      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4364      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4365    } else {
4366      // 256-bit logic and arithmetic instructions in AVX are all
4367      // floating-point, no support for integer ops. Emit fp zeroed vectors.
4368      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4369      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4370      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
4371    }
4372  } else
4373    llvm_unreachable("Unexpected vector type");
4374
4375  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4376}
4377
4378/// getOnesVector - Returns a vector of specified type with all bits set.
4379/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4380/// no AVX2 support, use two <4 x i32>s inserted into an <8 x i32> appropriately.
4381/// Then bitcast to their original type, ensuring they get CSE'd.
4382static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG,
4383                             DebugLoc dl) {
4384  assert(VT.isVector() && "Expected a vector type");
4385  unsigned Size = VT.getSizeInBits();
4386
4387  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
4388  SDValue Vec;
4389  if (Size == 256) {
4390    if (HasInt256) { // AVX2
4391      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4392      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4393    } else { // AVX
4394      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4395      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4396    }
4397  } else if (Size == 128) {
4398    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4399  } else
4400    llvm_unreachable("Unexpected vector type");
4401
4402  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4403}
4404
4405/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
4406/// that point to V2 point to its first element.
4407static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
4408  for (unsigned i = 0; i != NumElems; ++i) {
4409    if (Mask[i] > (int)NumElems) {
4410      Mask[i] = NumElems;
4411    }
4412  }
4413}
4414
4415/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
4416/// operation of the specified width.
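/// For example, for four elements the mask produced is <4, 1, 2, 3>.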
4417static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4418                       SDValue V2) {
4419  unsigned NumElems = VT.getVectorNumElements();
4420  SmallVector<int, 8> Mask;
4421  Mask.push_back(NumElems);
4422  for (unsigned i = 1; i != NumElems; ++i)
4423    Mask.push_back(i);
4424  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4425}
4426
4427/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
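/// For example, for four elements the mask produced is <0, 4, 1, 5>.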
4428static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4429                          SDValue V2) {
4430  unsigned NumElems = VT.getVectorNumElements();
4431  SmallVector<int, 8> Mask;
4432  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4433    Mask.push_back(i);
4434    Mask.push_back(i + NumElems);
4435  }
4436  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4437}
4438
4439/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
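/// For example, for four elements the mask produced is <2, 6, 3, 7>.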
4440static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4441                          SDValue V2) {
4442  unsigned NumElems = VT.getVectorNumElements();
4443  SmallVector<int, 8> Mask;
4444  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4445    Mask.push_back(i + Half);
4446    Mask.push_back(i + NumElems + Half);
4447  }
4448  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4449}
4450
4451// PromoteSplati8i16 - i16 and i8 vector types can't be used directly by a
4452// generic shuffle instruction because the target has no such instructions.
4453// Generate shuffles which repeat the i16 and i8 elements several times until
4454// they can be represented by v4f32 and then manipulated by target-supported shuffles.
4455static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
4456  EVT VT = V.getValueType();
4457  int NumElems = VT.getVectorNumElements();
4458  DebugLoc dl = V.getDebugLoc();
4459
4460  while (NumElems > 4) {
4461    if (EltNo < NumElems/2) {
4462      V = getUnpackl(DAG, dl, VT, V, V);
4463    } else {
4464      V = getUnpackh(DAG, dl, VT, V, V);
4465      EltNo -= NumElems/2;
4466    }
4467    NumElems >>= 1;
4468  }
4469  return V;
4470}
4471
4472/// getLegalSplat - Generate a legal splat with supported x86 shuffles
4473static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
4474  EVT VT = V.getValueType();
4475  DebugLoc dl = V.getDebugLoc();
4476  unsigned Size = VT.getSizeInBits();
4477
4478  if (Size == 128) {
4479    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
4480    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
4481    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
4482                             &SplatMask[0]);
4483  } else if (Size == 256) {
4484    // To use VPERMILPS to splat scalars, the second half of the indices must
4485    // refer to the higher part, which is a duplication of the lower one,
4486    // because VPERMILPS can only handle in-lane permutations.
4487    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
4488                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
4489
4490    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
4491    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
4492                             &SplatMask[0]);
4493  } else
4494    llvm_unreachable("Vector size not supported");
4495
4496  return DAG.getNode(ISD::BITCAST, dl, VT, V);
4497}
4498
4499/// PromoteSplat - Splat is promoted to target supported vector shuffles.
4500static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
4501  EVT SrcVT = SV->getValueType(0);
4502  SDValue V1 = SV->getOperand(0);
4503  DebugLoc dl = SV->getDebugLoc();
4504
4505  int EltNo = SV->getSplatIndex();
4506  int NumElems = SrcVT.getVectorNumElements();
4507  unsigned Size = SrcVT.getSizeInBits();
4508
4509  assert(((Size == 128 && NumElems > 4) || Size == 256) &&
4510          "Unknown how to promote splat for type");
4511
4512  // Extract the 128-bit part containing the splat element and update
4513  // the splat element index when it refers to the higher register.
4514  if (Size == 256) {
4515    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
4516    if (EltNo >= NumElems/2)
4517      EltNo -= NumElems/2;
4518  }
4519
4520  // i16 and i8 vector types can't be used directly by a generic shuffle
4521  // instruction because the target has no such instruction. Generate shuffles
4522  // which repeat the i16 and i8 elements several times until they fit in i32,
4523  // and then can be manipulated by target-supported shuffles.
4524  EVT EltVT = SrcVT.getVectorElementType();
4525  if (EltVT == MVT::i8 || EltVT == MVT::i16)
4526    V1 = PromoteSplati8i16(V1, DAG, EltNo);
4527
4528  // Recreate the 256-bit vector and place the same 128-bit vector
4529  // into the low and high parts. This is necessary because we want
4530  // to use VPERM* to shuffle the vectors.
4531  if (Size == 256) {
4532    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
4533  }
4534
4535  return getLegalSplat(DAG, V1, EltNo);
4536}
4537
4538/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
4539/// vector and a zero or undef vector. This produces a shuffle where the low
4540/// element of V2 is swizzled into the zero/undef vector, landing at element
4541/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4542static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
4543                                           bool IsZero,
4544                                           const X86Subtarget *Subtarget,
4545                                           SelectionDAG &DAG) {
4546  EVT VT = V2.getValueType();
4547  SDValue V1 = IsZero
4548    ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
4549  unsigned NumElems = VT.getVectorNumElements();
4550  SmallVector<int, 16> MaskVec;
4551  for (unsigned i = 0; i != NumElems; ++i)
4552    // If this is the insertion idx, put the low elt of V2 here.
4553    MaskVec.push_back(i == Idx ? NumElems : i);
4554  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
4555}
4556
4557/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
4558/// target specific opcode. Returns true if the Mask could be calculated.
4559/// Sets IsUnary to true if only uses one source.
4560static bool getTargetShuffleMask(SDNode *N, MVT VT,
4561                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
4562  unsigned NumElems = VT.getVectorNumElements();
4563  SDValue ImmN;
4564
4565  IsUnary = false;
4566  switch(N->getOpcode()) {
4567  case X86ISD::SHUFP:
4568    ImmN = N->getOperand(N->getNumOperands()-1);
4569    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4570    break;
4571  case X86ISD::UNPCKH:
4572    DecodeUNPCKHMask(VT, Mask);
4573    break;
4574  case X86ISD::UNPCKL:
4575    DecodeUNPCKLMask(VT, Mask);
4576    break;
4577  case X86ISD::MOVHLPS:
4578    DecodeMOVHLPSMask(NumElems, Mask);
4579    break;
4580  case X86ISD::MOVLHPS:
4581    DecodeMOVLHPSMask(NumElems, Mask);
4582    break;
4583  case X86ISD::PSHUFD:
4584  case X86ISD::VPERMILP:
4585    ImmN = N->getOperand(N->getNumOperands()-1);
4586    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4587    IsUnary = true;
4588    break;
4589  case X86ISD::PSHUFHW:
4590    ImmN = N->getOperand(N->getNumOperands()-1);
4591    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4592    IsUnary = true;
4593    break;
4594  case X86ISD::PSHUFLW:
4595    ImmN = N->getOperand(N->getNumOperands()-1);
4596    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4597    IsUnary = true;
4598    break;
4599  case X86ISD::VPERMI:
4600    ImmN = N->getOperand(N->getNumOperands()-1);
4601    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4602    IsUnary = true;
4603    break;
4604  case X86ISD::MOVSS:
4605  case X86ISD::MOVSD: {
4606    // Index 0 always comes from the first element of the second source;
4607    // this is why MOVSS and MOVSD are used in the first place. The other
4608    // elements come from the other positions of the first source vector.
4609    Mask.push_back(NumElems);
4610    for (unsigned i = 1; i != NumElems; ++i) {
4611      Mask.push_back(i);
4612    }
4613    break;
4614  }
4615  case X86ISD::VPERM2X128:
4616    ImmN = N->getOperand(N->getNumOperands()-1);
4617    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4618    if (Mask.empty()) return false;
4619    break;
4620  case X86ISD::MOVDDUP:
4621  case X86ISD::MOVLHPD:
4622  case X86ISD::MOVLPD:
4623  case X86ISD::MOVLPS:
4624  case X86ISD::MOVSHDUP:
4625  case X86ISD::MOVSLDUP:
4626  case X86ISD::PALIGN:
4627    // Not yet implemented
4628    return false;
4629  default: llvm_unreachable("unknown target shuffle node");
4630  }
4631
4632  return true;
4633}
4634
4635/// getShuffleScalarElt - Returns the scalar element that will make up the ith
4636/// element of the result of the vector shuffle.
4637static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
4638                                   unsigned Depth) {
4639  if (Depth == 6)
4640    return SDValue();  // Limit search depth.
4641
4642  SDValue V = SDValue(N, 0);
4643  EVT VT = V.getValueType();
4644  unsigned Opcode = V.getOpcode();
4645
4646  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
4647  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
4648    int Elt = SV->getMaskElt(Index);
4649
4650    if (Elt < 0)
4651      return DAG.getUNDEF(VT.getVectorElementType());
4652
4653    unsigned NumElems = VT.getVectorNumElements();
4654    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
4655                                         : SV->getOperand(1);
4656    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
4657  }
4658
4659  // Recurse into target specific vector shuffles to find scalars.
4660  if (isTargetShuffle(Opcode)) {
4661    MVT ShufVT = V.getValueType().getSimpleVT();
4662    unsigned NumElems = ShufVT.getVectorNumElements();
4663    SmallVector<int, 16> ShuffleMask;
4664    bool IsUnary;
4665
4666    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
4667      return SDValue();
4668
4669    int Elt = ShuffleMask[Index];
4670    if (Elt < 0)
4671      return DAG.getUNDEF(ShufVT.getVectorElementType());
4672
4673    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
4674                                         : N->getOperand(1);
4675    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
4676                               Depth+1);
4677  }
4678
4679  // Actual nodes that may contain scalar elements
4680  if (Opcode == ISD::BITCAST) {
4681    V = V.getOperand(0);
4682    EVT SrcVT = V.getValueType();
4683    unsigned NumElems = VT.getVectorNumElements();
4684
4685    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
4686      return SDValue();
4687  }
4688
4689  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
4690    return (Index == 0) ? V.getOperand(0)
4691                        : DAG.getUNDEF(VT.getVectorElementType());
4692
4693  if (V.getOpcode() == ISD::BUILD_VECTOR)
4694    return V.getOperand(Index);
4695
4696  return SDValue();
4697}
4698
4699/// getNumOfConsecutiveZeros - Return the number of elements of a vector
4700/// shuffle operation that consecutively come from a zero. The search can
4701/// start in two different directions, from the left or from the right.
4702static
4703unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
4704                                  bool ZerosFromLeft, SelectionDAG &DAG) {
4705  unsigned i;
4706  for (i = 0; i != NumElems; ++i) {
4707    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
4708    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
4709    if (!(Elt.getNode() &&
4710         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
4711      break;
4712  }
4713
4714  return i;
4715}
4716
4717/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
4718/// correspond consecutively to elements from one of the vector operands,
4719/// starting from its index OpIdx. Also sets OpNum to that source vector operand.
4720static
4721bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
4722                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
4723                              unsigned NumElems, unsigned &OpNum) {
4724  bool SeenV1 = false;
4725  bool SeenV2 = false;
4726
4727  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
4728    int Idx = SVOp->getMaskElt(i);
4729    // Ignore undef indices
4730    if (Idx < 0)
4731      continue;
4732
4733    if (Idx < (int)NumElems)
4734      SeenV1 = true;
4735    else
4736      SeenV2 = true;
4737
4738    // Only accept consecutive elements from the same vector
4739    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
4740      return false;
4741  }
4742
4743  OpNum = SeenV1 ? 0 : 1;
4744  return true;
4745}
4746
4747/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
4748/// logical right shift of a vector.
4749static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4750                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4751  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4752  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
4753              false /* check zeros from right */, DAG);
4754  unsigned OpSrc;
4755
4756  if (!NumZeros)
4757    return false;
4758
4759  // Considering the elements in the mask that are not consecutive zeros,
4760  // check if they consecutively come from only one of the source vectors.
4761  //
4762  //               V1 = {X, A, B, C}     0
4763  //                         \  \  \    /
4764  //   vector_shuffle V1, V2 <1, 2, 3, X>
4765  //
4766  if (!isShuffleMaskConsecutive(SVOp,
4767            0,                   // Mask Start Index
4768            NumElems-NumZeros,   // Mask End Index(exclusive)
4769            NumZeros,            // Where to start looking in the src vector
4770            NumElems,            // Number of elements in vector
4771            OpSrc))              // Which source operand ?
4772    return false;
4773
4774  isLeft = false;
4775  ShAmt = NumZeros;
4776  ShVal = SVOp->getOperand(OpSrc);
4777  return true;
4778}
4779
4780/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
4781/// logical left shift of a vector.
4782static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4783                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4784  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4785  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
4786              true /* check zeros from left */, DAG);
4787  unsigned OpSrc;
4788
4789  if (!NumZeros)
4790    return false;
4791
4792  // Considering the elements in the mask that are not consecutive zeros,
4793  // check if they consecutively come from only one of the source vectors.
4794  //
4795  //                           0    { A, B, X, X } = V2
4796  //                          / \    /  /
4797  //   vector_shuffle V1, V2 <X, X, 4, 5>
4798  //
4799  if (!isShuffleMaskConsecutive(SVOp,
4800            NumZeros,     // Mask Start Index
4801            NumElems,     // Mask End Index(exclusive)
4802            0,            // Where to start looking in the src vector
4803            NumElems,     // Number of elements in vector
4804            OpSrc))       // Which source operand ?
4805    return false;
4806
4807  isLeft = true;
4808  ShAmt = NumZeros;
4809  ShVal = SVOp->getOperand(OpSrc);
4810  return true;
4811}
4812
4813/// isVectorShift - Returns true if the shuffle can be implemented as a
4814/// logical left or right shift of a vector.
4815static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4816                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4817  // Although the logic below supports any bit width, there are no
4818  // shift instructions which handle more than 128-bit vectors.
4819  if (!SVOp->getValueType(0).is128BitVector())
4820    return false;
4821
4822  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
4823      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
4824    return true;
4825
4826  return false;
4827}
4828
4829/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
4830///
4831static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
4832                                       unsigned NumNonZero, unsigned NumZero,
4833                                       SelectionDAG &DAG,
4834                                       const X86Subtarget* Subtarget,
4835                                       const TargetLowering &TLI) {
4836  if (NumNonZero > 8)
4837    return SDValue();
4838
4839  DebugLoc dl = Op.getDebugLoc();
4840  SDValue V(0, 0);
4841  bool First = true;
4842  for (unsigned i = 0; i < 16; ++i) {
4843    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
4844    if (ThisIsNonZero && First) {
4845      if (NumZero)
4846        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4847      else
4848        V = DAG.getUNDEF(MVT::v8i16);
4849      First = false;
4850    }
4851
4852    if ((i & 1) != 0) {
4853      SDValue ThisElt(0, 0), LastElt(0, 0);
4854      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
4855      if (LastIsNonZero) {
4856        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
4857                              MVT::i16, Op.getOperand(i-1));
4858      }
4859      if (ThisIsNonZero) {
4860        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
4861        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
4862                              ThisElt, DAG.getConstant(8, MVT::i8));
4863        if (LastIsNonZero)
4864          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
4865      } else
4866        ThisElt = LastElt;
4867
4868      if (ThisElt.getNode())
4869        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
4870                        DAG.getIntPtrConstant(i/2));
4871    }
4872  }
4873
4874  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
4875}
4876
4877/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
4878///
4879static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
4880                                     unsigned NumNonZero, unsigned NumZero,
4881                                     SelectionDAG &DAG,
4882                                     const X86Subtarget* Subtarget,
4883                                     const TargetLowering &TLI) {
4884  if (NumNonZero > 4)
4885    return SDValue();
4886
4887  DebugLoc dl = Op.getDebugLoc();
4888  SDValue V(0, 0);
4889  bool First = true;
4890  for (unsigned i = 0; i < 8; ++i) {
4891    bool isNonZero = (NonZeros & (1 << i)) != 0;
4892    if (isNonZero) {
4893      if (First) {
4894        if (NumZero)
4895          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4896        else
4897          V = DAG.getUNDEF(MVT::v8i16);
4898        First = false;
4899      }
4900      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
4901                      MVT::v8i16, V, Op.getOperand(i),
4902                      DAG.getIntPtrConstant(i));
4903    }
4904  }
4905
4906  return V;
4907}
4908
4909/// getVShift - Return a vector logical shift node.
4910///
4911static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
4912                         unsigned NumBits, SelectionDAG &DAG,
4913                         const TargetLowering &TLI, DebugLoc dl) {
4914  assert(VT.is128BitVector() && "Unknown type for VShift");
4915  EVT ShVT = MVT::v2i64;
4916  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
4917  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
4918  return DAG.getNode(ISD::BITCAST, dl, VT,
4919                     DAG.getNode(Opc, dl, ShVT, SrcOp,
4920                             DAG.getConstant(NumBits,
4921                                  TLI.getShiftAmountTy(SrcOp.getValueType()))));
4922}
4923
4924SDValue
4925X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
4926                                          SelectionDAG &DAG) const {
4927
4928  // Check if the scalar load can be widened into a vector load. And if
4929  // the address is "base + cst" see if the cst can be "absorbed" into
4930  // the shuffle mask.
4931  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
4932    SDValue Ptr = LD->getBasePtr();
4933    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
4934      return SDValue();
4935    EVT PVT = LD->getValueType(0);
4936    if (PVT != MVT::i32 && PVT != MVT::f32)
4937      return SDValue();
4938
4939    int FI = -1;
4940    int64_t Offset = 0;
4941    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
4942      FI = FINode->getIndex();
4943      Offset = 0;
4944    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
4945               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
4946      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
4947      Offset = Ptr.getConstantOperandVal(1);
4948      Ptr = Ptr.getOperand(0);
4949    } else {
4950      return SDValue();
4951    }
4952
4953    // FIXME: 256-bit vector instructions don't require strict alignment;
4954    // improve this code to support them better.
4955    unsigned RequiredAlign = VT.getSizeInBits()/8;
4956    SDValue Chain = LD->getChain();
4957    // Make sure the stack object alignment is at least 16 or 32.
4958    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
4959    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
4960      if (MFI->isFixedObjectIndex(FI)) {
4961        // Can't change the alignment. FIXME: It's possible to compute
4962        // the exact stack offset and reference FI + adjusted offset instead.
4963        // If someone *really* cares about this, that's the way to implement it.
4964        return SDValue();
4965      } else {
4966        MFI->setObjectAlignment(FI, RequiredAlign);
4967      }
4968    }
4969
4970    // (Offset % 16 or 32) must be a multiple of 4. The address is then
4971    // Ptr + (Offset & ~15).
4972    if (Offset < 0)
4973      return SDValue();
4974    if ((Offset % RequiredAlign) & 3)
4975      return SDValue();
4976    int64_t StartOffset = Offset & ~(RequiredAlign-1);
4977    if (StartOffset)
4978      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
4979                        Ptr, DAG.getConstant(StartOffset, Ptr.getValueType()));
4980
4981    int EltNo = (Offset - StartOffset) >> 2;
4982    unsigned NumElems = VT.getVectorNumElements();
4983
4984    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
4985    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
4986                             LD->getPointerInfo().getWithOffset(StartOffset),
4987                             false, false, false, 0);
4988
4989    SmallVector<int, 8> Mask;
4990    for (unsigned i = 0; i != NumElems; ++i)
4991      Mask.push_back(EltNo);
4992
4993    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
4994  }
4995
4996  return SDValue();
4997}
4998
4999/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
5000/// vector of type 'VT', see if the elements can be replaced by a single large
5001/// load which has the same value as a build_vector whose operands are 'Elts'.
5002///
5003/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
5004///
5005/// FIXME: we'd also like to handle the case where the last elements are zero
5006/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
5007/// There's even a handy isZeroNode for that purpose.
5008static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
5009                                        DebugLoc &DL, SelectionDAG &DAG) {
5010  EVT EltVT = VT.getVectorElementType();
5011  unsigned NumElems = Elts.size();
5012
5013  LoadSDNode *LDBase = NULL;
5014  unsigned LastLoadedElt = -1U;
5015
5016  // For each element in the initializer, see if we've found a load or an undef.
5017  // If we don't find an initial load element, or later load elements are
5018  // non-consecutive, bail out.
5019  for (unsigned i = 0; i < NumElems; ++i) {
5020    SDValue Elt = Elts[i];
5021
5022    if (!Elt.getNode() ||
5023        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
5024      return SDValue();
5025    if (!LDBase) {
5026      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
5027        return SDValue();
5028      LDBase = cast<LoadSDNode>(Elt.getNode());
5029      LastLoadedElt = i;
5030      continue;
5031    }
5032    if (Elt.getOpcode() == ISD::UNDEF)
5033      continue;
5034
5035    LoadSDNode *LD = cast<LoadSDNode>(Elt);
5036    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
5037      return SDValue();
5038    LastLoadedElt = i;
5039  }
5040
5041  // If we have found an entire vector of loads and undefs, then return a large
5042  // load of the entire vector width starting at the base pointer.  If we found
5043  // consecutive loads for the low half, generate a vzext_load node.
5044  if (LastLoadedElt == NumElems - 1) {
5045    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
5046      return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5047                         LDBase->getPointerInfo(),
5048                         LDBase->isVolatile(), LDBase->isNonTemporal(),
5049                         LDBase->isInvariant(), 0);
5050    return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5051                       LDBase->getPointerInfo(),
5052                       LDBase->isVolatile(), LDBase->isNonTemporal(),
5053                       LDBase->isInvariant(), LDBase->getAlignment());
5054  }
5055  if (NumElems == 4 && LastLoadedElt == 1 &&
5056      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
5057    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
5058    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5059    SDValue ResNode =
5060        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
5061                                LDBase->getPointerInfo(),
5062                                LDBase->getAlignment(),
5063                                false/*isVolatile*/, true/*ReadMem*/,
5064                                false/*WriteMem*/);
5065
5066    // Make sure the newly-created LOAD is in the same position as LDBase in
5067    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
5068    // update uses of LDBase's output chain to use the TokenFactor.
5069    if (LDBase->hasAnyUseOfValue(1)) {
5070      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5071                             SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
5072      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5073      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5074                             SDValue(ResNode.getNode(), 1));
5075    }
5076
5077    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
5078  }
5079  return SDValue();
5080}
5081
5082/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
5083/// to generate a splat value for the following cases:
5084/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5085/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5086/// a scalar load, or a constant.
5087/// The VBROADCAST node is returned when a pattern is found,
5088/// or SDValue() otherwise.
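/// For example, a v8f32 BUILD_VECTOR whose operands are all the same scalar f32
/// load (and that load has no other uses) is lowered to a single
/// X86ISD::VBROADCAST of that loaded scalar.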
5089SDValue
5090X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
5091  if (!Subtarget->hasFp256())
5092    return SDValue();
5093
5094  EVT VT = Op.getValueType();
5095  DebugLoc dl = Op.getDebugLoc();
5096
5097  assert((VT.is128BitVector() || VT.is256BitVector()) &&
5098         "Unsupported vector type for broadcast.");
5099
5100  SDValue Ld;
5101  bool ConstSplatVal;
5102
5103  switch (Op.getOpcode()) {
5104    default:
5105      // Unknown pattern found.
5106      return SDValue();
5107
5108    case ISD::BUILD_VECTOR: {
5109      // The BUILD_VECTOR node must be a splat.
5110      if (!isSplatVector(Op.getNode()))
5111        return SDValue();
5112
5113      Ld = Op.getOperand(0);
5114      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5115                     Ld.getOpcode() == ISD::ConstantFP);
5116
5117      // The suspected load node has several users. Make sure that all
5118      // of its users are from the BUILD_VECTOR node.
5119      // Constants may have multiple users.
5120      if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
5121        return SDValue();
5122      break;
5123    }
5124
5125    case ISD::VECTOR_SHUFFLE: {
5126      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5127
5128      // Shuffles must have a splat mask where the first element is
5129      // broadcasted.
5130      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
5131        return SDValue();
5132
5133      SDValue Sc = Op.getOperand(0);
5134      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
5135          Sc.getOpcode() != ISD::BUILD_VECTOR) {
5136
5137        if (!Subtarget->hasInt256())
5138          return SDValue();
5139
5140        // Use the register form of the broadcast instruction available on AVX2.
5141        if (VT.is256BitVector())
5142          Sc = Extract128BitVector(Sc, 0, DAG, dl);
5143        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
5144      }
5145
5146      Ld = Sc.getOperand(0);
5147      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5148                       Ld.getOpcode() == ISD::ConstantFP);
5149
5150      // The scalar_to_vector node and the suspected
5151      // load node must have exactly one user.
5152      // Constants may have multiple users.
5153      if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
5154        return SDValue();
5155      break;
5156    }
5157  }
5158
5159  bool Is256 = VT.is256BitVector();
5160
5161  // Handle broadcasting a single constant scalar from the constant pool
5162  // into a vector. On Sandybridge it is still better to load a constant vector
5163  // from the constant pool and not to broadcast it from a scalar.
5164  if (ConstSplatVal && Subtarget->hasInt256()) {
5165    EVT CVT = Ld.getValueType();
5166    assert(!CVT.isVector() && "Must not broadcast a vector type");
5167    unsigned ScalarSize = CVT.getSizeInBits();
5168
5169    if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
5170      const Constant *C = 0;
5171      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5172        C = CI->getConstantIntValue();
5173      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5174        C = CF->getConstantFPValue();
5175
5176      assert(C && "Invalid constant type");
5177
5178      SDValue CP = DAG.getConstantPool(C, getPointerTy());
5179      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5180      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
5181                       MachinePointerInfo::getConstantPool(),
5182                       false, false, false, Alignment);
5183
5184      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5185    }
5186  }
5187
5188  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
5189  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5190
5191  // Handle AVX2 in-register broadcasts.
5192  if (!IsLoad && Subtarget->hasInt256() &&
5193      (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
5194    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5195
5196  // The scalar source must be a normal load.
5197  if (!IsLoad)
5198    return SDValue();
5199
5200  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
5201    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5202
5203  // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
5204  // match double, since there is no vbroadcastsd xmm instruction.
5205  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
5206    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5207      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5208  }
5209
5210  // Unsupported broadcast.
5211  return SDValue();
5212}
5213
5214SDValue
5215X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
5216  EVT VT = Op.getValueType();
5217
5218  // Skip if insert_vec_elt is not supported.
5219  if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
5220    return SDValue();
5221
5222  DebugLoc DL = Op.getDebugLoc();
5223  unsigned NumElems = Op.getNumOperands();
5224
5225  SDValue VecIn1;
5226  SDValue VecIn2;
5227  SmallVector<unsigned, 4> InsertIndices;
5228  SmallVector<int, 8> Mask(NumElems, -1);
5229
5230  for (unsigned i = 0; i != NumElems; ++i) {
5231    unsigned Opc = Op.getOperand(i).getOpcode();
5232
5233    if (Opc == ISD::UNDEF)
5234      continue;
5235
5236    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
5237      // Quit if more than 1 element needs inserting.
5238      if (InsertIndices.size() > 1)
5239        return SDValue();
5240
5241      InsertIndices.push_back(i);
5242      continue;
5243    }
5244
5245    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
5246    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
5247
5248    // Quit if extracted from vector of different type.
5249    if (ExtractedFromVec.getValueType() != VT)
5250      return SDValue();
5251
5252    // Quit if non-constant index.
5253    if (!isa<ConstantSDNode>(ExtIdx))
5254      return SDValue();
5255
5256    if (VecIn1.getNode() == 0)
5257      VecIn1 = ExtractedFromVec;
5258    else if (VecIn1 != ExtractedFromVec) {
5259      if (VecIn2.getNode() == 0)
5260        VecIn2 = ExtractedFromVec;
5261      else if (VecIn2 != ExtractedFromVec)
5262        // Quit if more than 2 vectors to shuffle
5263        return SDValue();
5264    }
5265
5266    unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5267
5268    if (ExtractedFromVec == VecIn1)
5269      Mask[i] = Idx;
5270    else if (ExtractedFromVec == VecIn2)
5271      Mask[i] = Idx + NumElems;
5272  }
5273
5274  if (VecIn1.getNode() == 0)
5275    return SDValue();
5276
5277  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5278  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
5279  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
5280    unsigned Idx = InsertIndices[i];
5281    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
5282                     DAG.getIntPtrConstant(Idx));
5283  }
5284
5285  return NV;
5286}
5287
5288SDValue
5289X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5290  DebugLoc dl = Op.getDebugLoc();
5291
5292  EVT VT = Op.getValueType();
5293  EVT ExtVT = VT.getVectorElementType();
5294  unsigned NumElems = Op.getNumOperands();
5295
5296  // Vectors containing all zeros can be matched by pxor and xorps later
5297  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5298    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
5299    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
5300    if (VT == MVT::v4i32 || VT == MVT::v8i32)
5301      return Op;
5302
5303    return getZeroVector(VT, Subtarget, DAG, dl);
5304  }
5305
5306  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
5307  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
5308  // vpcmpeqd on 256-bit vectors.
5309  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
5310    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
5311      return Op;
5312
5313    return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
5314  }
5315
5316  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
5317  if (Broadcast.getNode())
5318    return Broadcast;
5319
5320  unsigned EVTBits = ExtVT.getSizeInBits();
5321
5322  unsigned NumZero  = 0;
5323  unsigned NumNonZero = 0;
5324  unsigned NonZeros = 0;
5325  bool IsAllConstants = true;
5326  SmallSet<SDValue, 8> Values;
5327  for (unsigned i = 0; i < NumElems; ++i) {
5328    SDValue Elt = Op.getOperand(i);
5329    if (Elt.getOpcode() == ISD::UNDEF)
5330      continue;
5331    Values.insert(Elt);
5332    if (Elt.getOpcode() != ISD::Constant &&
5333        Elt.getOpcode() != ISD::ConstantFP)
5334      IsAllConstants = false;
5335    if (X86::isZeroNode(Elt))
5336      NumZero++;
5337    else {
5338      NonZeros |= (1 << i);
5339      NumNonZero++;
5340    }
5341  }
5342
5343  // An all-undef vector. Return an UNDEF.  All-zero vectors were handled above.
5344  if (NumNonZero == 0)
5345    return DAG.getUNDEF(VT);
5346
5347  // Special case for single non-zero, non-undef, element.
5348  if (NumNonZero == 1) {
5349    unsigned Idx = CountTrailingZeros_32(NonZeros);
5350    SDValue Item = Op.getOperand(Idx);
5351
5352    // If this is an insertion of an i64 value on x86-32, and if the top bits of
5353    // the value are obviously zero, truncate the value to i32 and do the
5354    // insertion that way.  Only do this if the value is non-constant or if the
5355    // value is a constant being inserted into element 0.  It is cheaper to do
5356    // a constant pool load than it is to do a movd + shuffle.
5357    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
5358        (!IsAllConstants || Idx == 0)) {
5359      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
5360        // Handle SSE only.
5361        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
5362        EVT VecVT = MVT::v4i32;
5363        unsigned VecElts = 4;
5364
5365        // Truncate the value (which may itself be a constant) to i32, and
5366        // convert it to a vector with movd (S2V+shuffle to zero extend).
5367        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
5368        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
5369        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5370
5371        // Now we have our 32-bit value zero extended in the low element of
5372        // a vector.  If Idx != 0, swizzle it into place.
5373        if (Idx != 0) {
5374          SmallVector<int, 4> Mask;
5375          Mask.push_back(Idx);
5376          for (unsigned i = 1; i != VecElts; ++i)
5377            Mask.push_back(i);
5378          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
5379                                      &Mask[0]);
5380        }
5381        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5382      }
5383    }
5384
5385    // If we have a constant or non-constant insertion into the low element of
5386    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
5387    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
5388    // depending on what the source datatype is.
5389    if (Idx == 0) {
5390      if (NumZero == 0)
5391        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5392
5393      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
5394          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
5395        if (VT.is256BitVector()) {
5396          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
5397          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
5398                             Item, DAG.getIntPtrConstant(0));
5399        }
5400        assert(VT.is128BitVector() && "Expected an SSE value type!");
5401        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5402        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
5403        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5404      }
5405
5406      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
5407        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
5408        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
5409        if (VT.is256BitVector()) {
5410          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
5411          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
5412        } else {
5413          assert(VT.is128BitVector() && "Expected an SSE value type!");
5414          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5415        }
5416        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5417      }
5418    }
5419
5420    // Is it a vector logical left shift?
5421    if (NumElems == 2 && Idx == 1 &&
5422        X86::isZeroNode(Op.getOperand(0)) &&
5423        !X86::isZeroNode(Op.getOperand(1))) {
5424      unsigned NumBits = VT.getSizeInBits();
5425      return getVShift(true, VT,
5426                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5427                                   VT, Op.getOperand(1)),
5428                       NumBits/2, DAG, *this, dl);
5429    }
5430
5431    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
5432      return SDValue();
5433
5434    // Otherwise, if this is a vector with i32 or f32 elements, and the element
5435    // is a non-constant being inserted into an element other than the low one,
5436    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
5437    // movd/movss) to move this into the low element, then shuffle it into
5438    // place.
5439    if (EVTBits == 32) {
5440      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5441
5442      // Turn it into a shuffle of zero and zero-extended scalar to vector.
5443      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
5444      SmallVector<int, 8> MaskVec;
5445      for (unsigned i = 0; i != NumElems; ++i)
5446        MaskVec.push_back(i == Idx ? 0 : 1);
5447      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
5448    }
5449  }
5450
5451  // Splat is obviously ok. Let legalizer expand it to a shuffle.
5452  if (Values.size() == 1) {
5453    if (EVTBits == 32) {
5454      // Instead of a shuffle like this:
5455      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
5456      // check whether it's possible to issue this instead:
5457      // shuffle (vload ptr), undef, <1, 1, 1, 1>
5458      unsigned Idx = CountTrailingZeros_32(NonZeros);
5459      SDValue Item = Op.getOperand(Idx);
5460      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
5461        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
5462    }
5463    return SDValue();
5464  }
5465
5466  // A vector full of immediates; various special cases are already
5467  // handled, so this is best done with a single constant-pool load.
5468  if (IsAllConstants)
5469    return SDValue();
5470
5471  // For AVX-length vectors, build the individual 128-bit pieces and use
5472  // shuffles to put them in place.
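  // For example (illustrative), a v8i32 BUILD_VECTOR is split into two v4i32
  // BUILD_VECTORs of operands 0-3 and 4-7, which are then joined with a
  // single 128-bit insert by Concat128BitVectors (matching vinsertf128).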
5473  if (VT.is256BitVector()) {
5474    SmallVector<SDValue, 32> V;
5475    for (unsigned i = 0; i != NumElems; ++i)
5476      V.push_back(Op.getOperand(i));
5477
5478    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
5479
5480    // Build both the lower and upper subvector.
5481    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
5482    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
5483                                NumElems/2);
5484
5485    // Recreate the wider vector with the lower and upper part.
5486    return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
5487  }
5488
5489  // Let legalizer expand 2-wide build_vectors.
5490  if (EVTBits == 64) {
5491    if (NumNonZero == 1) {
5492      // One half is zero or undef.
5493      unsigned Idx = CountTrailingZeros_32(NonZeros);
5494      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
5495                                 Op.getOperand(Idx));
5496      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
5497    }
5498    return SDValue();
5499  }
5500
5501  // If element VT is < 32 bits, convert it to inserts into a zero vector.
5502  if (EVTBits == 8 && NumElems == 16) {
5503    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
5504                                        Subtarget, *this);
5505    if (V.getNode()) return V;
5506  }
5507
5508  if (EVTBits == 16 && NumElems == 8) {
5509    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
5510                                      Subtarget, *this);
5511    if (V.getNode()) return V;
5512  }
5513
5514  // If element VT is == 32 bits, turn it into a number of shuffles.
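  // Illustrative example: for a v4i32 BUILD_VECTOR <x, 0, y, 0>, NonZeros is
  // 0b0101; each pair of elements is combined below with getMOVL/getUnpackl,
  // and the two halves are merged with a final 4-element shuffle.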
5515  SmallVector<SDValue, 8> V(NumElems);
5516  if (NumElems == 4 && NumZero > 0) {
5517    for (unsigned i = 0; i < 4; ++i) {
5518      bool isZero = !(NonZeros & (1 << i));
5519      if (isZero)
5520        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
5521      else
5522        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5523    }
5524
5525    for (unsigned i = 0; i < 2; ++i) {
5526      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
5527        default: break;
5528        case 0:
5529          V[i] = V[i*2];  // Must be a zero vector.
5530          break;
5531        case 1:
5532          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
5533          break;
5534        case 2:
5535          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
5536          break;
5537        case 3:
5538          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
5539          break;
5540      }
5541    }
5542
5543    bool Reverse1 = (NonZeros & 0x3) == 2;
5544    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
5545    int MaskVec[] = {
5546      Reverse1 ? 1 : 0,
5547      Reverse1 ? 0 : 1,
5548      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
5549      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
5550    };
5551    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
5552  }
5553
5554  if (Values.size() > 1 && VT.is128BitVector()) {
5555    // Check for a build vector of consecutive loads.
5556    for (unsigned i = 0; i < NumElems; ++i)
5557      V[i] = Op.getOperand(i);
5558
5559    // Check for elements which are consecutive loads.
5560    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
5561    if (LD.getNode())
5562      return LD;
5563
5564    // Check for a build vector from mostly shuffle plus few inserting.
5565    SDValue Sh = buildFromShuffleMostly(Op, DAG);
5566    if (Sh.getNode())
5567      return Sh;
5568
5569    // For SSE 4.1, use insertps to put the high elements into the low element.
5570    if (getSubtarget()->hasSSE41()) {
5571      SDValue Result;
5572      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
5573        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
5574      else
5575        Result = DAG.getUNDEF(VT);
5576
5577      for (unsigned i = 1; i < NumElems; ++i) {
5578        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
5579        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
5580                             Op.getOperand(i), DAG.getIntPtrConstant(i));
5581      }
5582      return Result;
5583    }
5584
5585    // Otherwise, expand into a number of unpckl*, start by extending each of
5586    // our (non-undef) elements to the full vector width with the element in the
5587    // bottom slot of the vector (which generates no code for SSE).
5588    for (unsigned i = 0; i < NumElems; ++i) {
5589      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
5590        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5591      else
5592        V[i] = DAG.getUNDEF(VT);
5593    }
5594
5595    // Next, we iteratively mix elements, e.g. for v4f32:
5596    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
5597    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
5598    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
5599    unsigned EltStride = NumElems >> 1;
5600    while (EltStride != 0) {
5601      for (unsigned i = 0; i < EltStride; ++i) {
5602        // If V[i+EltStride] is undef and this is the first round of mixing,
5603        // then it is safe to just drop this shuffle: V[i] is already in the
5604        // right place, the one element (since it's the first round) being
5605        // inserted as undef can be dropped.  This isn't safe for successive
5606        // rounds because they will permute elements within both vectors.
5607        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
5608            EltStride == NumElems/2)
5609          continue;
5610
5611        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
5612      }
5613      EltStride >>= 1;
5614    }
5615    return V[0];
5616  }
5617  return SDValue();
5618}
5619
5620// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
5621// to create 256-bit vectors from two other 128-bit ones.
5622static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5623  DebugLoc dl = Op.getDebugLoc();
5624  EVT ResVT = Op.getValueType();
5625
5626  assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
5627
5628  SDValue V1 = Op.getOperand(0);
5629  SDValue V2 = Op.getOperand(1);
5630  unsigned NumElems = ResVT.getVectorNumElements();
5631
5632  return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
5633}
5634
5635static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5636  assert(Op.getNumOperands() == 2);
5637
5638  // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
5639  // from two other 128-bit ones.
5640  return LowerAVXCONCAT_VECTORS(Op, DAG);
5641}
5642
5643// Try to lower a shuffle node into a simple blend instruction.
5644static SDValue
5645LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
5646                           const X86Subtarget *Subtarget, SelectionDAG &DAG) {
5647  SDValue V1 = SVOp->getOperand(0);
5648  SDValue V2 = SVOp->getOperand(1);
5649  DebugLoc dl = SVOp->getDebugLoc();
5650  EVT VT = SVOp->getValueType(0);
5651  EVT EltVT = VT.getVectorElementType();
5652  unsigned NumElems = VT.getVectorNumElements();
5653
5654  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
5655    return SDValue();
5656  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
5657    return SDValue();
5658
5659  // Check the mask for BLEND and build the value.
5660  unsigned MaskValue = 0;
5661  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
5662  unsigned NumLanes = (NumElems-1)/8 + 1;
5663  unsigned NumElemsInLane = NumElems / NumLanes;
5664
5665  // Blend for v16i16 should be symmetric for both lanes.
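  // Illustrative example: for a v8i16 shuffle with mask
  // <0, 9, 2, 11, 4, 13, 6, 15>, elements 1, 3, 5 and 7 come from V2, so the
  // loop below builds the blend immediate MaskValue == 0b10101010 (0xAA).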
5666  for (unsigned i = 0; i < NumElemsInLane; ++i) {
5667
5668    int SndLaneEltIdx = (NumLanes == 2) ?
5669      SVOp->getMaskElt(i + NumElemsInLane) : -1;
5670    int EltIdx = SVOp->getMaskElt(i);
5671
5672    if ((EltIdx == -1 || EltIdx == (int)i) &&
5673        (SndLaneEltIdx == -1 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
5674      continue;
5675
5676    if (((unsigned)EltIdx == (i + NumElems)) &&
5677        (SndLaneEltIdx == -1 ||
5678         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
5679      MaskValue |= (1<<i);
5680    else
5681      return SDValue();
5682  }
5683
5684  // Convert i32 vectors to floating point if it is not AVX2.
5685  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
5686  EVT BlendVT = VT;
5687  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
5688    BlendVT = EVT::getVectorVT(*DAG.getContext(),
5689                              EVT::getFloatingPointVT(EltVT.getSizeInBits()),
5690                              NumElems);
5691    V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
5692    V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
5693  }
5694
5695  SDValue Ret =  DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
5696                             DAG.getConstant(MaskValue, MVT::i32));
5697  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
5698}
5699
5700// v8i16 shuffles - Prefer shuffles in the following order:
5701// 1. [all]   pshuflw, pshufhw, optional move
5702// 2. [ssse3] 1 x pshufb
5703// 3. [ssse3] 2 x pshufb + 1 x por
5704// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
5705static SDValue
5706LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
5707                         SelectionDAG &DAG) {
5708  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5709  SDValue V1 = SVOp->getOperand(0);
5710  SDValue V2 = SVOp->getOperand(1);
5711  DebugLoc dl = SVOp->getDebugLoc();
5712  SmallVector<int, 8> MaskVals;
5713
5714  // Determine if more than 1 of the words in each of the low and high quadwords
5715  // of the result come from the same quadword of one of the two inputs.  Undef
5716  // mask values count as coming from any quadword, for better codegen.
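  // Illustrative example: for the v8i16 mask <0, 1, 2, 3, 8, 9, 10, 11>, all
  // low words come from quadword 0 and all high words from quadword 2, so
  // BestLoQuad becomes 0 and BestHiQuad becomes 2 below.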
5717  unsigned LoQuad[] = { 0, 0, 0, 0 };
5718  unsigned HiQuad[] = { 0, 0, 0, 0 };
5719  std::bitset<4> InputQuads;
5720  for (unsigned i = 0; i < 8; ++i) {
5721    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
5722    int EltIdx = SVOp->getMaskElt(i);
5723    MaskVals.push_back(EltIdx);
5724    if (EltIdx < 0) {
5725      ++Quad[0];
5726      ++Quad[1];
5727      ++Quad[2];
5728      ++Quad[3];
5729      continue;
5730    }
5731    ++Quad[EltIdx / 4];
5732    InputQuads.set(EltIdx / 4);
5733  }
5734
5735  int BestLoQuad = -1;
5736  unsigned MaxQuad = 1;
5737  for (unsigned i = 0; i < 4; ++i) {
5738    if (LoQuad[i] > MaxQuad) {
5739      BestLoQuad = i;
5740      MaxQuad = LoQuad[i];
5741    }
5742  }
5743
5744  int BestHiQuad = -1;
5745  MaxQuad = 1;
5746  for (unsigned i = 0; i < 4; ++i) {
5747    if (HiQuad[i] > MaxQuad) {
5748      BestHiQuad = i;
5749      MaxQuad = HiQuad[i];
5750    }
5751  }
5752
5753  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
5754  // of the two input vectors, shuffle them into one input vector so only a
5755  // single pshufb instruction is necessary. If there are more than 2 input
5756  // quads, disable the next transformation since it does not help SSSE3.
5757  bool V1Used = InputQuads[0] || InputQuads[1];
5758  bool V2Used = InputQuads[2] || InputQuads[3];
5759  if (Subtarget->hasSSSE3()) {
5760    if (InputQuads.count() == 2 && V1Used && V2Used) {
5761      BestLoQuad = InputQuads[0] ? 0 : 1;
5762      BestHiQuad = InputQuads[2] ? 2 : 3;
5763    }
5764    if (InputQuads.count() > 2) {
5765      BestLoQuad = -1;
5766      BestHiQuad = -1;
5767    }
5768  }
5769
5770  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
5771  // the shuffle mask.  If a quad is scored as -1, that means that it contains
5772  // words from all 4 input quadwords.
5773  SDValue NewV;
5774  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
5775    int MaskV[] = {
5776      BestLoQuad < 0 ? 0 : BestLoQuad,
5777      BestHiQuad < 0 ? 1 : BestHiQuad
5778    };
5779    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
5780                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
5781                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
5782    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
5783
5784    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
5785    // source words for the shuffle, to aid later transformations.
5786    bool AllWordsInNewV = true;
5787    bool InOrder[2] = { true, true };
5788    for (unsigned i = 0; i != 8; ++i) {
5789      int idx = MaskVals[i];
5790      if (idx != (int)i)
5791        InOrder[i/4] = false;
5792      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
5793        continue;
5794      AllWordsInNewV = false;
5795      break;
5796    }
5797
5798    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
5799    if (AllWordsInNewV) {
5800      for (int i = 0; i != 8; ++i) {
5801        int idx = MaskVals[i];
5802        if (idx < 0)
5803          continue;
5804        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
5805        if ((idx != i) && idx < 4)
5806          pshufhw = false;
5807        if ((idx != i) && idx > 3)
5808          pshuflw = false;
5809      }
5810      V1 = NewV;
5811      V2Used = false;
5812      BestLoQuad = 0;
5813      BestHiQuad = 1;
5814    }
5815
5816    // If we've eliminated the use of V2, and the new mask is a pshuflw or
5817    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
5818    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
5819      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
5820      unsigned TargetMask = 0;
5821      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
5822                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
5823      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5824      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
5825                             getShufflePSHUFLWImmediate(SVOp);
5826      V1 = NewV.getOperand(0);
5827      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
5828    }
5829  }
5830
5831  // If we have SSSE3, and all words of the result are from 1 input vector,
5832  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
5833  // is present, fall back to case 4.
5834  if (Subtarget->hasSSSE3()) {
5835    SmallVector<SDValue,16> pshufbMask;
5836
5837    // If we have elements from both input vectors, set the high bit of the
5838    // shuffle mask element to zero out elements that come from V2 in the V1
5839    // mask, and elements that come from V1 in the V2 mask, so that the two
5840    // results can be OR'd together.
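    // Illustrative example: a word mask value of 9 (word 1 of V2) contributes
    // byte indices 0x80, 0x80 to the V1 pshufb mask (zeroing that word) and
    // byte indices 2, 3 to the V2 pshufb mask below.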
5841    bool TwoInputs = V1Used && V2Used;
5842    for (unsigned i = 0; i != 8; ++i) {
5843      int EltIdx = MaskVals[i] * 2;
5844      int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
5845      int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
5846      pshufbMask.push_back(DAG.getConstant(Idx0,   MVT::i8));
5847      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
5848    }
5849    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
5850    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5851                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5852                                 MVT::v16i8, &pshufbMask[0], 16));
5853    if (!TwoInputs)
5854      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5855
5856    // Calculate the shuffle mask for the second input, shuffle it, and
5857    // OR it with the first shuffled input.
5858    pshufbMask.clear();
5859    for (unsigned i = 0; i != 8; ++i) {
5860      int EltIdx = MaskVals[i] * 2;
5861      int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
5862      int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
5863      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
5864      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
5865    }
5866    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
5867    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
5868                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5869                                 MVT::v16i8, &pshufbMask[0], 16));
5870    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
5871    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5872  }
5873
5874  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
5875  // and update MaskVals with new element order.
5876  std::bitset<8> InOrder;
5877  if (BestLoQuad >= 0) {
5878    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
5879    for (int i = 0; i != 4; ++i) {
5880      int idx = MaskVals[i];
5881      if (idx < 0) {
5882        InOrder.set(i);
5883      } else if ((idx / 4) == BestLoQuad) {
5884        MaskV[i] = idx & 3;
5885        InOrder.set(i);
5886      }
5887    }
5888    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
5889                                &MaskV[0]);
5890
5891    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
5892      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5893      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
5894                                  NewV.getOperand(0),
5895                                  getShufflePSHUFLWImmediate(SVOp), DAG);
5896    }
5897  }
5898
5899  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
5900  // and update MaskVals with the new element order.
5901  if (BestHiQuad >= 0) {
5902    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
5903    for (unsigned i = 4; i != 8; ++i) {
5904      int idx = MaskVals[i];
5905      if (idx < 0) {
5906        InOrder.set(i);
5907      } else if ((idx / 4) == BestHiQuad) {
5908        MaskV[i] = (idx & 3) + 4;
5909        InOrder.set(i);
5910      }
5911    }
5912    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
5913                                &MaskV[0]);
5914
5915    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
5916      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5917      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
5918                                  NewV.getOperand(0),
5919                                  getShufflePSHUFHWImmediate(SVOp), DAG);
5920    }
5921  }
5922
5923  // In case BestHi & BestLo were both -1, which means each quadword has a word
5924  // from each of the four input quadwords, calculate the InOrder bitvector now
5925  // before falling through to the insert/extract cleanup.
5926  if (BestLoQuad == -1 && BestHiQuad == -1) {
5927    NewV = V1;
5928    for (int i = 0; i != 8; ++i)
5929      if (MaskVals[i] < 0 || MaskVals[i] == i)
5930        InOrder.set(i);
5931  }
5932
5933  // The other elements are put in the right place using pextrw and pinsrw.
5934  for (unsigned i = 0; i != 8; ++i) {
5935    if (InOrder[i])
5936      continue;
5937    int EltIdx = MaskVals[i];
5938    if (EltIdx < 0)
5939      continue;
5940    SDValue ExtOp = (EltIdx < 8) ?
5941      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
5942                  DAG.getIntPtrConstant(EltIdx)) :
5943      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
5944                  DAG.getIntPtrConstant(EltIdx - 8));
5945    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
5946                       DAG.getIntPtrConstant(i));
5947  }
5948  return NewV;
5949}
5950
5951// v16i8 shuffles - Prefer shuffles in the following order:
5952// 1. [ssse3] 1 x pshufb
5953// 2. [ssse3] 2 x pshufb + 1 x por
5954// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
5955static
5956SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
5957                                 SelectionDAG &DAG,
5958                                 const X86TargetLowering &TLI) {
5959  SDValue V1 = SVOp->getOperand(0);
5960  SDValue V2 = SVOp->getOperand(1);
5961  DebugLoc dl = SVOp->getDebugLoc();
5962  ArrayRef<int> MaskVals = SVOp->getMask();
5963
5964  // If we have SSSE3, case 1 is generated when all result bytes come from
5965  // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
5966  // present, fall back to case 3.
5967
5968  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
5969  if (TLI.getSubtarget()->hasSSSE3()) {
5970    SmallVector<SDValue,16> pshufbMask;
5971
5972    // If all result elements are from one input vector, then only translate
5973    // undef mask values to 0x80 (zero out result) in the pshufb mask.
5974    //
5975    // Otherwise, we have elements from both input vectors, and must zero out
5976    // elements that come from V2 in the first mask, and V1 in the second mask
5977    // so that we can OR them together.
5978    for (unsigned i = 0; i != 16; ++i) {
5979      int EltIdx = MaskVals[i];
5980      if (EltIdx < 0 || EltIdx >= 16)
5981        EltIdx = 0x80;
5982      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
5983    }
5984    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5985                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5986                                 MVT::v16i8, &pshufbMask[0], 16));
5987
5988    // As PSHUFB will zero elements with negative indices, it's safe to ignore
5989    // the 2nd operand if it's undefined or zero.
5990    if (V2.getOpcode() == ISD::UNDEF ||
5991        ISD::isBuildVectorAllZeros(V2.getNode()))
5992      return V1;
5993
5994    // Calculate the shuffle mask for the second input, shuffle it, and
5995    // OR it with the first shuffled input.
5996    pshufbMask.clear();
5997    for (unsigned i = 0; i != 16; ++i) {
5998      int EltIdx = MaskVals[i];
5999      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
6000      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6001    }
6002    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
6003                     DAG.getNode(ISD::BUILD_VECTOR, dl,
6004                                 MVT::v16i8, &pshufbMask[0], 16));
6005    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
6006  }
6007
6008  // No SSSE3 - Calculate the in-place words and then fix all out-of-place words
6009  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
6010  // the 16 different words that comprise the two doublequadword input vectors.
6011  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
6012  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
6013  SDValue NewV = V1;
6014  for (int i = 0; i != 8; ++i) {
6015    int Elt0 = MaskVals[i*2];
6016    int Elt1 = MaskVals[i*2+1];
6017
6018    // This word of the result is all undef, skip it.
6019    if (Elt0 < 0 && Elt1 < 0)
6020      continue;
6021
6022    // This word of the result is already in the correct place, skip it.
6023    if ((Elt0 == i*2) && (Elt1 == i*2+1))
6024      continue;
6025
6026    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
6027    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
6028    SDValue InsElt;
6029
6030    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
6031    // together using a single extract, extract the word and insert it.
6032    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
6033      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
6034                           DAG.getIntPtrConstant(Elt1 / 2));
6035      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
6036                        DAG.getIntPtrConstant(i));
6037      continue;
6038    }
6039
6040    // If Elt1 is defined, extract it from the appropriate source.  If the
6041    // source byte is not also odd, shift the extracted word left 8 bits;
6042    // otherwise clear the bottom 8 bits if we need to do an OR.
6043    if (Elt1 >= 0) {
6044      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
6045                           DAG.getIntPtrConstant(Elt1 / 2));
6046      if ((Elt1 & 1) == 0)
6047        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
6048                             DAG.getConstant(8,
6049                                  TLI.getShiftAmountTy(InsElt.getValueType())));
6050      else if (Elt0 >= 0)
6051        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
6052                             DAG.getConstant(0xFF00, MVT::i16));
6053    }
6054    // If Elt0 is defined, extract it from the appropriate source.  If the
6055    // source byte is not also even, shift the extracted word right 8 bits. If
6056    // Elt1 was also defined, OR the extracted values together before
6057    // inserting them in the result.
6058    if (Elt0 >= 0) {
6059      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
6060                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
6061      if ((Elt0 & 1) != 0)
6062        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
6063                              DAG.getConstant(8,
6064                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
6065      else if (Elt1 >= 0)
6066        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
6067                             DAG.getConstant(0x00FF, MVT::i16));
6068      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
6069                         : InsElt0;
6070    }
6071    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
6072                       DAG.getIntPtrConstant(i));
6073  }
6074  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
6075}
6076
6077// v32i8 shuffles - Translate to VPSHUFB if possible.
6078static
6079SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
6080                                 const X86Subtarget *Subtarget,
6081                                 SelectionDAG &DAG) {
6082  EVT VT = SVOp->getValueType(0);
6083  SDValue V1 = SVOp->getOperand(0);
6084  SDValue V2 = SVOp->getOperand(1);
6085  DebugLoc dl = SVOp->getDebugLoc();
6086  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
6087
6088  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6089  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
6090  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
6091
6092  // VPSHUFB may be generated if
6093  // (1) one of the input vectors is undefined or a zeroinitializer
6094  //     (the mask value 0x80 puts 0 in the corresponding slot of the vector), and
6095  // (2) the mask indices don't cross a 128-bit lane.
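  // For example (illustrative), a mask that places byte 20 of the source into
  // result position 3 would move data from the high 128-bit lane into the low
  // lane, which a single VPSHUFB cannot do, so such shuffles are rejected.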
6096  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
6097      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
6098    return SDValue();
6099
6100  if (V1IsAllZero && !V2IsAllZero) {
6101    CommuteVectorShuffleMask(MaskVals, 32);
6102    V1 = V2;
6103  }
6104  SmallVector<SDValue, 32> pshufbMask;
6105  for (unsigned i = 0; i != 32; i++) {
6106    int EltIdx = MaskVals[i];
6107    if (EltIdx < 0 || EltIdx >= 32)
6108      EltIdx = 0x80;
6109    else {
6110      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
6111        // Crossing a 128-bit lane is not allowed.
6112        return SDValue();
6113      EltIdx &= 0xf;
6114    }
6115    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6116  }
6117  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
6118                      DAG.getNode(ISD::BUILD_VECTOR, dl,
6119                                  MVT::v32i8, &pshufbMask[0], 32));
6120}
6121
6122/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
6123/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
6124/// done when every pair / quad of shuffle mask elements point to elements in
6125/// the right sequence. e.g.
6126/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
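/// With Scale == 2, this example is rewritten as the v4i32 shuffle
/// vector_shuffle X', Y', <1, 5, 0, 7> (illustrative).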
6127static
6128SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
6129                                 SelectionDAG &DAG, DebugLoc dl) {
6130  MVT VT = SVOp->getValueType(0).getSimpleVT();
6131  unsigned NumElems = VT.getVectorNumElements();
6132  MVT NewVT;
6133  unsigned Scale;
6134  switch (VT.SimpleTy) {
6135  default: llvm_unreachable("Unexpected!");
6136  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
6137  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
6138  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
6139  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
6140  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
6141  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
6142  }
6143
6144  SmallVector<int, 8> MaskVec;
6145  for (unsigned i = 0; i != NumElems; i += Scale) {
6146    int StartIdx = -1;
6147    for (unsigned j = 0; j != Scale; ++j) {
6148      int EltIdx = SVOp->getMaskElt(i+j);
6149      if (EltIdx < 0)
6150        continue;
6151      if (StartIdx < 0)
6152        StartIdx = (EltIdx / Scale);
6153      if (EltIdx != (int)(StartIdx*Scale + j))
6154        return SDValue();
6155    }
6156    MaskVec.push_back(StartIdx);
6157  }
6158
6159  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
6160  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
6161  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
6162}
6163
6164/// getVZextMovL - Return a zero-extending vector move low node.
6165///
6166static SDValue getVZextMovL(EVT VT, EVT OpVT,
6167                            SDValue SrcOp, SelectionDAG &DAG,
6168                            const X86Subtarget *Subtarget, DebugLoc dl) {
6169  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
6170    LoadSDNode *LD = NULL;
6171    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
6172      LD = dyn_cast<LoadSDNode>(SrcOp);
6173    if (!LD) {
6174      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
6175      // instead.
6176      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
6177      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
6178          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6179          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
6180          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
6181        // PR2108
6182        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
6183        return DAG.getNode(ISD::BITCAST, dl, VT,
6184                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6185                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6186                                                   OpVT,
6187                                                   SrcOp.getOperand(0)
6188                                                          .getOperand(0))));
6189      }
6190    }
6191  }
6192
6193  return DAG.getNode(ISD::BITCAST, dl, VT,
6194                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6195                                 DAG.getNode(ISD::BITCAST, dl,
6196                                             OpVT, SrcOp)));
6197}
6198
6199/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
6200/// which could not be matched by any known target-specific shuffle.
6201static SDValue
6202LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6203
6204  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
6205  if (NewOp.getNode())
6206    return NewOp;
6207
6208  EVT VT = SVOp->getValueType(0);
6209
6210  unsigned NumElems = VT.getVectorNumElements();
6211  unsigned NumLaneElems = NumElems / 2;
6212
6213  DebugLoc dl = SVOp->getDebugLoc();
6214  MVT EltVT = VT.getVectorElementType().getSimpleVT();
6215  EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
6216  SDValue Output[2];
6217
6218  SmallVector<int, 16> Mask;
6219  for (unsigned l = 0; l < 2; ++l) {
6220    // Build a shuffle mask for the output, discovering on the fly which
6221    // input vectors to use as shuffle operands (recorded in InputUsed).
6222    // If building a suitable shuffle vector proves too hard, then bail
6223    // out with UseBuildVector set.
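    // Illustrative example: for a v8f32 shuffle whose low-lane mask is
    // <0, 9, 1, 8>, the inputs used are the low 128-bit halves of V1 and V2,
    // and the per-lane mask built here is <0, 5, 1, 4>.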
6224    bool UseBuildVector = false;
6225    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
6226    unsigned LaneStart = l * NumLaneElems;
6227    for (unsigned i = 0; i != NumLaneElems; ++i) {
6228      // The mask element.  This indexes into the input.
6229      int Idx = SVOp->getMaskElt(i+LaneStart);
6230      if (Idx < 0) {
6231        // the mask element does not index into any input vector.
6232        Mask.push_back(-1);
6233        continue;
6234      }
6235
6236      // The input vector this mask element indexes into.
6237      int Input = Idx / NumLaneElems;
6238
6239      // Turn the index into an offset from the start of the input vector.
6240      Idx -= Input * NumLaneElems;
6241
6242      // Find or create a shuffle vector operand to hold this input.
6243      unsigned OpNo;
6244      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
6245        if (InputUsed[OpNo] == Input)
6246          // This input vector is already an operand.
6247          break;
6248        if (InputUsed[OpNo] < 0) {
6249          // Create a new operand for this input vector.
6250          InputUsed[OpNo] = Input;
6251          break;
6252        }
6253      }
6254
6255      if (OpNo >= array_lengthof(InputUsed)) {
6256        // More than two input vectors used!  Give up on trying to create a
6257        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
6258        UseBuildVector = true;
6259        break;
6260      }
6261
6262      // Add the mask index for the new shuffle vector.
6263      Mask.push_back(Idx + OpNo * NumLaneElems);
6264    }
6265
6266    if (UseBuildVector) {
6267      SmallVector<SDValue, 16> SVOps;
6268      for (unsigned i = 0; i != NumLaneElems; ++i) {
6269        // The mask element.  This indexes into the input.
6270        int Idx = SVOp->getMaskElt(i+LaneStart);
6271        if (Idx < 0) {
6272          SVOps.push_back(DAG.getUNDEF(EltVT));
6273          continue;
6274        }
6275
6276        // The input vector this mask element indexes into.
6277        int Input = Idx / NumElems;
6278
6279        // Turn the index into an offset from the start of the input vector.
6280        Idx -= Input * NumElems;
6281
6282        // Extract the vector element by hand.
6283        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
6284                                    SVOp->getOperand(Input),
6285                                    DAG.getIntPtrConstant(Idx)));
6286      }
6287
6288      // Construct the output using a BUILD_VECTOR.
6289      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
6290                              SVOps.size());
6291    } else if (InputUsed[0] < 0) {
6292      // No input vectors were used! The result is undefined.
6293      Output[l] = DAG.getUNDEF(NVT);
6294    } else {
6295      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
6296                                        (InputUsed[0] % 2) * NumLaneElems,
6297                                        DAG, dl);
6298      // If only one input was used, use an undefined vector for the other.
6299      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
6300        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
6301                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
6302      // At least one input vector was used. Create a new shuffle vector.
6303      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
6304    }
6305
6306    Mask.clear();
6307  }
6308
6309  // Concatenate the result back
6310  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
6311}
6312
6313/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
6314/// 4 elements, and match them with several different shuffle types.
6315static SDValue
6316LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6317  SDValue V1 = SVOp->getOperand(0);
6318  SDValue V2 = SVOp->getOperand(1);
6319  DebugLoc dl = SVOp->getDebugLoc();
6320  EVT VT = SVOp->getValueType(0);
6321
6322  assert(VT.is128BitVector() && "Unsupported vector size");
6323
6324  std::pair<int, int> Locs[4];
6325  int Mask1[] = { -1, -1, -1, -1 };
6326  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
6327
6328  unsigned NumHi = 0;
6329  unsigned NumLo = 0;
6330  for (unsigned i = 0; i != 4; ++i) {
6331    int Idx = PermMask[i];
6332    if (Idx < 0) {
6333      Locs[i] = std::make_pair(-1, -1);
6334    } else {
6335      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
6336      if (Idx < 4) {
6337        Locs[i] = std::make_pair(0, NumLo);
6338        Mask1[NumLo] = Idx;
6339        NumLo++;
6340      } else {
6341        Locs[i] = std::make_pair(1, NumHi);
6342        if (2+NumHi < 4)
6343          Mask1[2+NumHi] = Idx;
6344        NumHi++;
6345      }
6346    }
6347  }
6348
6349  if (NumLo <= 2 && NumHi <= 2) {
6350    // No more than two elements come from either vector. This can be
6351    // implemented with two shuffles. The first shuffle gathers the elements.
6352    // The second shuffle, which takes the first shuffle as both of its
6353    // vector operands, puts the elements into the right order.
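    // Illustrative example: for PermMask <0, 4, 1, 5>, the first shuffle uses
    // mask <0, 1, 4, 5> to gather the needed elements, and the second shuffle
    // uses mask <0, 2, 5, 7> on that result to produce the requested order.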
6354    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6355
6356    int Mask2[] = { -1, -1, -1, -1 };
6357
6358    for (unsigned i = 0; i != 4; ++i)
6359      if (Locs[i].first != -1) {
6360        unsigned Idx = (i < 2) ? 0 : 4;
6361        Idx += Locs[i].first * 2 + Locs[i].second;
6362        Mask2[i] = Idx;
6363      }
6364
6365    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
6366  }
6367
6368  if (NumLo == 3 || NumHi == 3) {
6369    // Otherwise, we must have three elements from one vector, call it X, and
6370    // one element from the other, call it Y.  First, use a shufps to build an
6371    // intermediate vector with the one element from Y and the element from X
6372    // that will be in the same half in the final destination (the indexes don't
6373    // matter). Then, use a shufps to build the final vector, taking the half
6374    // containing the element from Y from the intermediate, and the other half
6375    // from X.
6376    if (NumHi == 3) {
6377      // Normalize it so the 3 elements come from V1.
6378      CommuteVectorShuffleMask(PermMask, 4);
6379      std::swap(V1, V2);
6380    }
6381
6382    // Find the element from V2.
6383    unsigned HiIndex;
6384    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
6385      int Val = PermMask[HiIndex];
6386      if (Val < 0)
6387        continue;
6388      if (Val >= 4)
6389        break;
6390    }
6391
6392    Mask1[0] = PermMask[HiIndex];
6393    Mask1[1] = -1;
6394    Mask1[2] = PermMask[HiIndex^1];
6395    Mask1[3] = -1;
6396    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6397
6398    if (HiIndex >= 2) {
6399      Mask1[0] = PermMask[0];
6400      Mask1[1] = PermMask[1];
6401      Mask1[2] = HiIndex & 1 ? 6 : 4;
6402      Mask1[3] = HiIndex & 1 ? 4 : 6;
6403      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6404    }
6405
6406    Mask1[0] = HiIndex & 1 ? 2 : 0;
6407    Mask1[1] = HiIndex & 1 ? 0 : 2;
6408    Mask1[2] = PermMask[2];
6409    Mask1[3] = PermMask[3];
6410    if (Mask1[2] >= 0)
6411      Mask1[2] += 4;
6412    if (Mask1[3] >= 0)
6413      Mask1[3] += 4;
6414    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
6415  }
6416
6417  // Break it into (shuffle shuffle_hi, shuffle_lo).
6418  int LoMask[] = { -1, -1, -1, -1 };
6419  int HiMask[] = { -1, -1, -1, -1 };
6420
6421  int *MaskPtr = LoMask;
6422  unsigned MaskIdx = 0;
6423  unsigned LoIdx = 0;
6424  unsigned HiIdx = 2;
6425  for (unsigned i = 0; i != 4; ++i) {
6426    if (i == 2) {
6427      MaskPtr = HiMask;
6428      MaskIdx = 1;
6429      LoIdx = 0;
6430      HiIdx = 2;
6431    }
6432    int Idx = PermMask[i];
6433    if (Idx < 0) {
6434      Locs[i] = std::make_pair(-1, -1);
6435    } else if (Idx < 4) {
6436      Locs[i] = std::make_pair(MaskIdx, LoIdx);
6437      MaskPtr[LoIdx] = Idx;
6438      LoIdx++;
6439    } else {
6440      Locs[i] = std::make_pair(MaskIdx, HiIdx);
6441      MaskPtr[HiIdx] = Idx;
6442      HiIdx++;
6443    }
6444  }
6445
6446  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
6447  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
6448  int MaskOps[] = { -1, -1, -1, -1 };
6449  for (unsigned i = 0; i != 4; ++i)
6450    if (Locs[i].first != -1)
6451      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
6452  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
6453}
6454
6455static bool MayFoldVectorLoad(SDValue V) {
6456  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6457    V = V.getOperand(0);
6458
6459  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6460    V = V.getOperand(0);
6461  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
6462      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
6463    // BUILD_VECTOR (load), undef
6464    V = V.getOperand(0);
6465
6466  return MayFoldLoad(V);
6467}
6468
6469static
6470SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
6471  EVT VT = Op.getValueType();
6472
6473  // Canonicalize to v2f64.
6474  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
6475  return DAG.getNode(ISD::BITCAST, dl, VT,
6476                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
6477                                          V1, DAG));
6478}
6479
6480static
6481SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
6482                        bool HasSSE2) {
6483  SDValue V1 = Op.getOperand(0);
6484  SDValue V2 = Op.getOperand(1);
6485  EVT VT = Op.getValueType();
6486
6487  assert(VT != MVT::v2i64 && "unsupported shuffle type");
6488
6489  if (HasSSE2 && VT == MVT::v2f64)
6490    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
6491
6492  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
6493  return DAG.getNode(ISD::BITCAST, dl, VT,
6494                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
6495                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
6496                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
6497}
6498
6499static
6500SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
6501  SDValue V1 = Op.getOperand(0);
6502  SDValue V2 = Op.getOperand(1);
6503  EVT VT = Op.getValueType();
6504
6505  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
6506         "unsupported shuffle type");
6507
6508  if (V2.getOpcode() == ISD::UNDEF)
6509    V2 = V1;
6510
6511  // v4i32 or v4f32
6512  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
6513}
6514
6515static
6516SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
6517  SDValue V1 = Op.getOperand(0);
6518  SDValue V2 = Op.getOperand(1);
6519  EVT VT = Op.getValueType();
6520  unsigned NumElems = VT.getVectorNumElements();
6521
6522  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
6523  // operand of these instructions is only memory, so check if there's a
6524  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
6525  // same masks.
6526  bool CanFoldLoad = false;
6527
6528  // Trivial case, when V2 comes from a load.
6529  if (MayFoldVectorLoad(V2))
6530    CanFoldLoad = true;
6531
6532  // When V1 is a load, it can be folded later into a store in isel, example:
6533  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
6534  //    turns into:
6535  //  (MOVLPSmr addr:$src1, VR128:$src2)
6536  // So, recognize this potential and also use MOVLPS or MOVLPD
6537  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
6538    CanFoldLoad = true;
6539
6540  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6541  if (CanFoldLoad) {
6542    if (HasSSE2 && NumElems == 2)
6543      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
6544
6545    if (NumElems == 4)
6546      // If we don't care about the second element, proceed to use movss.
6547      if (SVOp->getMaskElt(1) != -1)
6548        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
6549  }
6550
6551  // movl and movlp will both match v2i64, but v2i64 is never matched by
6552  // movl earlier because we make it strict to avoid messing with the movlp load
6553  // folding logic (see the code above getMOVLP call). Match it here then,
6554  // this is horrible, but will stay like this until we move all shuffle
6555  // matching to x86 specific nodes. Note that for the 1st condition all
6556  // types are matched with movsd.
6557  if (HasSSE2) {
6558    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
6559    // as to remove this logic from here, as much as possible
6560    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
6561      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6562    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6563  }
6564
6565  assert(VT != MVT::v4i32 && "unsupported shuffle type");
6566
6567  // Invert the operand order and use SHUFPS to match it.
6568  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
6569                              getShuffleSHUFImmediate(SVOp), DAG);
6570}
6571
6572// Reduce a vector shuffle to zext.
6573SDValue
6574X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
6575  // PMOVZX is only available from SSE41.
6576  if (!Subtarget->hasSSE41())
6577    return SDValue();
6578
6579  EVT VT = Op.getValueType();
6580
6581  // Only AVX2 supports 256-bit vector integer extension.
6582  if (!Subtarget->hasInt256() && VT.is256BitVector())
6583    return SDValue();
6584
6585  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6586  DebugLoc DL = Op.getDebugLoc();
6587  SDValue V1 = Op.getOperand(0);
6588  SDValue V2 = Op.getOperand(1);
6589  unsigned NumElems = VT.getVectorNumElements();
6590
6591  // Extending is a unary operation and the element type of the source vector
6592  // must be smaller than i64.
6593  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
6594      VT.getVectorElementType() == MVT::i64)
6595    return SDValue();
6596
6597  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
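  // Illustrative example: a v8i16 shuffle of V1 and undef with mask
  // <0, -1, 1, -1, 2, -1, 3, -1> has ratio 2 (Shift == 1) and is lowered
  // below to a VZEXT from v8i16 to v4i32.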
6598  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
6599  while ((1U << Shift) < NumElems) {
6600    if (SVOp->getMaskElt(1U << Shift) == 1)
6601      break;
6602    Shift += 1;
6603    // The maximal ratio is 8, i.e. from i8 to i64.
6604    if (Shift > 3)
6605      return SDValue();
6606  }
6607
6608  // Check the shuffle mask.
6609  unsigned Mask = (1U << Shift) - 1;
6610  for (unsigned i = 0; i != NumElems; ++i) {
6611    int EltIdx = SVOp->getMaskElt(i);
6612    if ((i & Mask) != 0 && EltIdx != -1)
6613      return SDValue();
6614    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
6615      return SDValue();
6616  }
6617
6618  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
6619  EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits);
6620  EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift);
6621
6622  if (!isTypeLegal(NVT))
6623    return SDValue();
6624
6625  // Simplify the operand as it's prepared to be fed into shuffle.
6626  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
6627  if (V1.getOpcode() == ISD::BITCAST &&
6628      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6629      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6630      V1.getOperand(0)
6631        .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
6632    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
6633    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
6634    ConstantSDNode *CIdx =
6635      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
6636    // If it's foldable, i.e. a normal load with a single use, we will let code
6637    // selection fold it. Otherwise, we will shorten the conversion sequence.
6638    if (CIdx && CIdx->getZExtValue() == 0 &&
6639        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse()))
6640      V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
6641  }
6642
6643  return DAG.getNode(ISD::BITCAST, DL, VT,
6644                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
6645}
6646
6647SDValue
6648X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
6649  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6650  EVT VT = Op.getValueType();
6651  DebugLoc dl = Op.getDebugLoc();
6652  SDValue V1 = Op.getOperand(0);
6653  SDValue V2 = Op.getOperand(1);
6654
6655  if (isZeroShuffle(SVOp))
6656    return getZeroVector(VT, Subtarget, DAG, dl);
6657
6658  // Handle splat operations
6659  if (SVOp->isSplat()) {
6660    unsigned NumElem = VT.getVectorNumElements();
6661    int Size = VT.getSizeInBits();
6662
6663    // Use vbroadcast whenever the splat comes from a foldable load
6664    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
6665    if (Broadcast.getNode())
6666      return Broadcast;
6667
6668    // Handle splats by matching through known shuffle masks
6669    if ((Size == 128 && NumElem <= 4) ||
6670        (Size == 256 && NumElem <= 8))
6671      return SDValue();
6672
6673    // All remaining splats are promoted to target-supported vector shuffles.
6674    return PromoteSplat(SVOp, DAG);
6675  }
6676
6677  // Check for integer-expanding shuffles.
6678  SDValue NewOp = lowerVectorIntExtend(Op, DAG);
6679  if (NewOp.getNode())
6680    return NewOp;
6681
6682  // If the shuffle can be profitably rewritten as a narrower shuffle, then
6683  // do it!
6684  if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
6685      VT == MVT::v16i16 || VT == MVT::v32i8) {
6686    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6687    if (NewOp.getNode())
6688      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
6689  } else if ((VT == MVT::v4i32 ||
6690             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
6691    // FIXME: Figure out a cleaner way to do this.
6692    // Try to make use of movq to zero out the top part.
6693    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
6694      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6695      if (NewOp.getNode()) {
6696        EVT NewVT = NewOp.getValueType();
6697        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
6698                               NewVT, true, false))
6699          return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
6700                              DAG, Subtarget, dl);
6701      }
6702    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
6703      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6704      if (NewOp.getNode()) {
6705        EVT NewVT = NewOp.getValueType();
6706        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
6707          return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
6708                              DAG, Subtarget, dl);
6709      }
6710    }
6711  }
6712  return SDValue();
6713}
6714
6715SDValue
6716X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
6717  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6718  SDValue V1 = Op.getOperand(0);
6719  SDValue V2 = Op.getOperand(1);
6720  EVT VT = Op.getValueType();
6721  DebugLoc dl = Op.getDebugLoc();
6722  unsigned NumElems = VT.getVectorNumElements();
6723  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
6724  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6725  bool V1IsSplat = false;
6726  bool V2IsSplat = false;
6727  bool HasSSE2 = Subtarget->hasSSE2();
6728  bool HasFp256    = Subtarget->hasFp256();
6729  bool HasInt256   = Subtarget->hasInt256();
6730  MachineFunction &MF = DAG.getMachineFunction();
6731  bool OptForSize = MF.getFunction()->getFnAttributes().
6732    hasAttribute(Attribute::OptimizeForSize);
6733
6734  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
6735
6736  if (V1IsUndef && V2IsUndef)
6737    return DAG.getUNDEF(VT);
6738
6739  assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
6740
6741  // Vector shuffle lowering takes 3 steps:
6742  //
6743  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
6744  //    narrowing and commutation of operands should be handled.
6745  // 2) Matching of shuffles with known shuffle masks to x86 target specific
6746  //    shuffle nodes.
6747  // 3) Rewriting of unmatched masks into new generic shuffle operations,
6748  //    so the shuffle can be broken into other shuffles and the legalizer can
6749  //    try the lowering again.
6750  //
6751  // The general idea is that no vector_shuffle operation should be left to
6752  // be matched during isel, all of them must be converted to a target specific
6753  // node here.
6754
6755  // Normalize the input vectors. Here splats, zeroed vectors, profitable
6756  // narrowing and commutation of operands should be handled. The actual code
6757  // doesn't include all of those, work in progress...
6758  SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
6759  if (NewOp.getNode())
6760    return NewOp;
6761
6762  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
6763
6764  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
6765  // unpckh_undef). Only use pshufd if speed is more important than size.
6766  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
6767    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6768  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
6769    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6770
6771  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
6772      V2IsUndef && MayFoldVectorLoad(V1))
6773    return getMOVDDup(Op, dl, V1, DAG);
6774
6775  if (isMOVHLPS_v_undef_Mask(M, VT))
6776    return getMOVHighToLow(Op, dl, DAG);
6777
6778  // Used to match splats
6779  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
6780      (VT == MVT::v2f64 || VT == MVT::v2i64))
6781    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6782
6783  if (isPSHUFDMask(M, VT)) {
6784    // The mask matched by the if above can be selected into several different
6785    // instructions during isel, not only pshufd as its name says. Sad but true;
6786    // emulate that behavior for now...
6787    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
6788      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
6789
6790    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
6791
6792    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
6793      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
6794
6795    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
6796      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
6797                                  DAG);
6798
6799    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
6800                                TargetMask, DAG);
6801  }
6802
6803  // Check if this can be converted into a logical shift.
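  // For example, shuffling V1 with an all-zero V2 using mask <1, 2, 3, 4> on
  // v4i32 moves every element of V1 down by one position and shifts a zero
  // into the top, i.e. a whole-vector logical shift by one element.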
6804  bool isLeft = false;
6805  unsigned ShAmt = 0;
6806  SDValue ShVal;
6807  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
6808  if (isShift && ShVal.hasOneUse()) {
6809    // If the shifted value has multiple uses, it may be cheaper to use
6810    // v_set0 + movlhps or movhlps, etc.
6811    EVT EltVT = VT.getVectorElementType();
6812    ShAmt *= EltVT.getSizeInBits();
6813    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6814  }
6815
6816  if (isMOVLMask(M, VT)) {
6817    if (ISD::isBuildVectorAllZeros(V1.getNode()))
6818      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
6819    if (!isMOVLPMask(M, VT)) {
6820      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
6821        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6822
6823      if (VT == MVT::v4i32 || VT == MVT::v4f32)
6824        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6825    }
6826  }
6827
6828  // FIXME: fold these into legal mask.
6829  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
6830    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
6831
6832  if (isMOVHLPSMask(M, VT))
6833    return getMOVHighToLow(Op, dl, DAG);
6834
6835  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
6836    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
6837
6838  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
6839    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
6840
6841  if (isMOVLPMask(M, VT))
6842    return getMOVLP(Op, dl, DAG, HasSSE2);
6843
6844  if (ShouldXformToMOVHLPS(M, VT) ||
6845      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
6846    return CommuteVectorShuffle(SVOp, DAG);
6847
6848  if (isShift) {
6849    // No better options. Use a vshldq / vsrldq.
6850    EVT EltVT = VT.getVectorElementType();
6851    ShAmt *= EltVT.getSizeInBits();
6852    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6853  }
6854
6855  bool Commuted = false;
6856  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
6857  // 1,1,1,1 -> v8i16 though.
6858  V1IsSplat = isSplatVector(V1.getNode());
6859  V2IsSplat = isSplatVector(V2.getNode());
6860
6861  // Canonicalize the splat or undef, if present, to be on the RHS.
6862  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
6863    CommuteVectorShuffleMask(M, NumElems);
6864    std::swap(V1, V2);
6865    std::swap(V1IsSplat, V2IsSplat);
6866    Commuted = true;
6867  }
6868
6869  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
6870    // Shuffling low element of v1 into undef, just return v1.
6871    if (V2IsUndef)
6872      return V1;
6873    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
6874    // the instruction selector will not match, so get a canonical MOVL with
6875    // swapped operands to undo the commute.
6876    return getMOVL(DAG, dl, VT, V2, V1);
6877  }
6878
6879  if (isUNPCKLMask(M, VT, HasInt256))
6880    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6881
6882  if (isUNPCKHMask(M, VT, HasInt256))
6883    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6884
6885  if (V2IsSplat) {
6886    // Normalize the mask so all entries that point to V2 point to its first
6887    // element, then try to match unpck{h|l} again. If a match is found, return
6888    // a new vector_shuffle with the corrected mask.
6889    SmallVector<int, 8> NewMask(M.begin(), M.end());
6890    NormalizeMask(NewMask, NumElems);
6891    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
6892      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6893    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
6894      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6895  }
6896
6897  if (Commuted) {
6898    // Commute it back and try unpck* again.
6899    // FIXME: this seems wrong.
6900    CommuteVectorShuffleMask(M, NumElems);
6901    std::swap(V1, V2);
6902    std::swap(V1IsSplat, V2IsSplat);
6903    Commuted = false;
6904
6905    if (isUNPCKLMask(M, VT, HasInt256))
6906      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6907
6908    if (isUNPCKHMask(M, VT, HasInt256))
6909      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6910  }
6911
6912  // Normalize the node to match x86 shuffle ops if needed
6913  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
6914    return CommuteVectorShuffle(SVOp, DAG);
6915
6916  // The checks below are all present in isShuffleMaskLegal, but they are
6917  // inlined here right now to enable us to directly emit target-specific
6918  // nodes; they will be removed one by one until none of them return Op anymore.
6919
6920  if (isPALIGNRMask(M, VT, Subtarget))
6921    return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
6922                                getShufflePALIGNRImmediate(SVOp),
6923                                DAG);
6924
6925  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
6926      SVOp->getSplatIndex() == 0 && V2IsUndef) {
6927    if (VT == MVT::v2f64 || VT == MVT::v2i64)
6928      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6929  }
6930
6931  if (isPSHUFHWMask(M, VT, HasInt256))
6932    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
6933                                getShufflePSHUFHWImmediate(SVOp),
6934                                DAG);
6935
6936  if (isPSHUFLWMask(M, VT, HasInt256))
6937    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
6938                                getShufflePSHUFLWImmediate(SVOp),
6939                                DAG);
6940
6941  if (isSHUFPMask(M, VT, HasFp256))
6942    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
6943                                getShuffleSHUFImmediate(SVOp), DAG);
6944
6945  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
6946    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6947  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
6948    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6949
6950  //===--------------------------------------------------------------------===//
6951  // Generate target specific nodes for 128 or 256-bit shuffles only
6952  // supported in the AVX instruction set.
6953  //
6954
6955  // Handle VMOVDDUPY permutations
6956  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
6957    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
6958
6959  // Handle VPERMILPS/D* permutations
6960  if (isVPERMILPMask(M, VT, HasFp256)) {
6961    if (HasInt256 && VT == MVT::v8i32)
6962      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
6963                                  getShuffleSHUFImmediate(SVOp), DAG);
6964    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
6965                                getShuffleSHUFImmediate(SVOp), DAG);
6966  }
6967
6968  // Handle VPERM2F128/VPERM2I128 permutations
6969  if (isVPERM2X128Mask(M, VT, HasFp256))
6970    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
6971                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
6972
6973  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
6974  if (BlendOp.getNode())
6975    return BlendOp;
6976
6977  if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
6978    SmallVector<SDValue, 8> permclMask;
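    // Undef elements in the mask are mapped to index 0 so the whole mask can
    // be materialized as a constant vector.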
6979    for (unsigned i = 0; i != 8; ++i) {
6980      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
6981    }
6982    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
6983                               &permclMask[0], 8);
6984    // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
6985    return DAG.getNode(X86ISD::VPERMV, dl, VT,
6986                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
6987  }
6988
6989  if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
6990    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
6991                                getShuffleCLImmediate(SVOp), DAG);
6992
6993  //===--------------------------------------------------------------------===//
6994  // Since no target specific shuffle was selected for this generic one,
6995  // lower it into other known shuffles. FIXME: this isn't true yet, but
6996  // this is the plan.
6997  //
6998
6999  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
7000  if (VT == MVT::v8i16) {
7001    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
7002    if (NewOp.getNode())
7003      return NewOp;
7004  }
7005
7006  if (VT == MVT::v16i8) {
7007    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
7008    if (NewOp.getNode())
7009      return NewOp;
7010  }
7011
7012  if (VT == MVT::v32i8) {
7013    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
7014    if (NewOp.getNode())
7015      return NewOp;
7016  }
7017
7018  // Handle all 128-bit wide vectors with 4 elements, and match them with
7019  // several different shuffle types.
7020  if (NumElems == 4 && VT.is128BitVector())
7021    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
7022
7023  // Handle general 256-bit shuffles
7024  if (VT.is256BitVector())
7025    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
7026
7027  return SDValue();
7028}
7029
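// LowerEXTRACT_VECTOR_ELT_SSE4 - Lower extracts from 128-bit vectors using the
// SSE4.1 extract forms (pextrb, pextrw, extractps, pextrq). Returns an empty
// SDValue when no profitable SSE4.1 lowering applies.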
7030SDValue
7031X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
7032                                                SelectionDAG &DAG) const {
7033  EVT VT = Op.getValueType();
7034  DebugLoc dl = Op.getDebugLoc();
7035
7036  if (!Op.getOperand(0).getValueType().is128BitVector())
7037    return SDValue();
7038
7039  if (VT.getSizeInBits() == 8) {
7040    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
7041                                  Op.getOperand(0), Op.getOperand(1));
7042    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
7043                                  DAG.getValueType(VT));
7044    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7045  }
7046
7047  if (VT.getSizeInBits() == 16) {
7048    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7049    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
7050    if (Idx == 0)
7051      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
7052                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7053                                     DAG.getNode(ISD::BITCAST, dl,
7054                                                 MVT::v4i32,
7055                                                 Op.getOperand(0)),
7056                                     Op.getOperand(1)));
7057    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
7058                                  Op.getOperand(0), Op.getOperand(1));
7059    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
7060                                  DAG.getValueType(VT));
7061    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7062  }
7063
7064  if (VT == MVT::f32) {
7065    // EXTRACTPS outputs to a GPR32 register, which will require a movd to copy
7066    // the result back to an FR32 register. It's only worth matching if the
7067    // result has a single use which is a store or a bitcast to i32.  And in
7068    // the case of a store, it's not worth it if the index is a constant 0,
7069    // because a MOVSSmr can be used instead, which is smaller and faster.
7070    if (!Op.hasOneUse())
7071      return SDValue();
7072    SDNode *User = *Op.getNode()->use_begin();
7073    if ((User->getOpcode() != ISD::STORE ||
7074         (isa<ConstantSDNode>(Op.getOperand(1)) &&
7075          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
7076        (User->getOpcode() != ISD::BITCAST ||
7077         User->getValueType(0) != MVT::i32))
7078      return SDValue();
7079    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7080                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
7081                                              Op.getOperand(0)),
7082                                              Op.getOperand(1));
7083    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
7084  }
7085
7086  if (VT == MVT::i32 || VT == MVT::i64) {
7087    // ExtractPS/pextrq work with a constant index.
7088    if (isa<ConstantSDNode>(Op.getOperand(1)))
7089      return Op;
7090  }
7091  return SDValue();
7092}
7093
7094SDValue
7095X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
7096                                           SelectionDAG &DAG) const {
7097  if (!isa<ConstantSDNode>(Op.getOperand(1)))
7098    return SDValue();
7099
7100  SDValue Vec = Op.getOperand(0);
7101  EVT VecVT = Vec.getValueType();
7102
7103  // If this is a 256-bit vector result, first extract the 128-bit vector and
7104  // then extract the element from the 128-bit vector.
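  // For example, extracting element 5 from a v8i32 becomes an extract of
  // element 1 from the upper 128-bit half.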
7105  if (VecVT.is256BitVector()) {
7106    DebugLoc dl = Op.getNode()->getDebugLoc();
7107    unsigned NumElems = VecVT.getVectorNumElements();
7108    SDValue Idx = Op.getOperand(1);
7109    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7110
7111    // Get the 128-bit vector.
7112    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
7113
7114    if (IdxVal >= NumElems/2)
7115      IdxVal -= NumElems/2;
7116    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
7117                       DAG.getConstant(IdxVal, MVT::i32));
7118  }
7119
7120  assert(VecVT.is128BitVector() && "Unexpected vector length");
7121
7122  if (Subtarget->hasSSE41()) {
7123    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
7124    if (Res.getNode())
7125      return Res;
7126  }
7127
7128  EVT VT = Op.getValueType();
7129  DebugLoc dl = Op.getDebugLoc();
7130  // TODO: handle v16i8.
7131  if (VT.getSizeInBits() == 16) {
7132    SDValue Vec = Op.getOperand(0);
7133    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7134    if (Idx == 0)
7135      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
7136                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7137                                     DAG.getNode(ISD::BITCAST, dl,
7138                                                 MVT::v4i32, Vec),
7139                                     Op.getOperand(1)));
7140    // Transform it so it matches pextrw, which produces a 32-bit result.
7141    EVT EltVT = MVT::i32;
7142    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
7143                                  Op.getOperand(0), Op.getOperand(1));
7144    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
7145                                  DAG.getValueType(VT));
7146    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7147  }
7148
7149  if (VT.getSizeInBits() == 32) {
7150    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7151    if (Idx == 0)
7152      return Op;
7153
7154    // SHUFPS the element to the lowest double word, then movss.
7155    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
7156    EVT VVT = Op.getOperand(0).getValueType();
7157    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7158                                       DAG.getUNDEF(VVT), Mask);
7159    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7160                       DAG.getIntPtrConstant(0));
7161  }
7162
7163  if (VT.getSizeInBits() == 64) {
7164    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
7165    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
7166    //        to match extract_elt for f64.
7167    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7168    if (Idx == 0)
7169      return Op;
7170
7171    // UNPCKHPD the element to the lowest double word, then movsd.
7172    // Note: if the lower 64 bits of the result of the UNPCKHPD are then stored
7173    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
7174    int Mask[2] = { 1, -1 };
7175    EVT VVT = Op.getOperand(0).getValueType();
7176    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7177                                       DAG.getUNDEF(VVT), Mask);
7178    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7179                       DAG.getIntPtrConstant(0));
7180  }
7181
7182  return SDValue();
7183}
7184
7185SDValue
7186X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
7187                                               SelectionDAG &DAG) const {
7188  EVT VT = Op.getValueType();
7189  EVT EltVT = VT.getVectorElementType();
7190  DebugLoc dl = Op.getDebugLoc();
7191
7192  SDValue N0 = Op.getOperand(0);
7193  SDValue N1 = Op.getOperand(1);
7194  SDValue N2 = Op.getOperand(2);
7195
7196  if (!VT.is128BitVector())
7197    return SDValue();
7198
7199  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
7200      isa<ConstantSDNode>(N2)) {
7201    unsigned Opc;
7202    if (VT == MVT::v8i16)
7203      Opc = X86ISD::PINSRW;
7204    else if (VT == MVT::v16i8)
7205      Opc = X86ISD::PINSRB;
7206    else
7207      Opc = X86ISD::PINSRB;
7208
7209    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
7210    // argument.
7211    if (N1.getValueType() != MVT::i32)
7212      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7213    if (N2.getValueType() != MVT::i32)
7214      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7215    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
7216  }
7217
7218  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
7219    // Bits [7:6] of the constant are the source select.  This will always be
7220    //  zero here.  The DAG Combiner may combine an extract_elt index into these
7221    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
7222    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
7223    // Bits [5:4] of the constant are the destination select.  This is the
7224    //  value of the incoming immediate.
7225    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
7226    //   combine either bitwise AND or insert of float 0.0 to set these bits.
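    // For example, an insertion into element 2 of the destination uses the
    // immediate 0x20: destination select 2 in bits [5:4] and a zero mask of 0.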
7227    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
7228    // Create this as a scalar to vector.
7229    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
7230    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
7231  }
7232
7233  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
7234    // PINSR* works with constant index.
7235    return Op;
7236  }
7237  return SDValue();
7238}
7239
7240SDValue
7241X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
7242  EVT VT = Op.getValueType();
7243  EVT EltVT = VT.getVectorElementType();
7244
7245  DebugLoc dl = Op.getDebugLoc();
7246  SDValue N0 = Op.getOperand(0);
7247  SDValue N1 = Op.getOperand(1);
7248  SDValue N2 = Op.getOperand(2);
7249
7250  // If this is a 256-bit vector result, first extract the 128-bit vector,
7251  // insert the element into the extracted half and then place it back.
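  // For example, inserting into element 6 of a v8i32 becomes an insert into
  // element 2 of the upper 128-bit half, which is then written back.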
7252  if (VT.is256BitVector()) {
7253    if (!isa<ConstantSDNode>(N2))
7254      return SDValue();
7255
7256    // Get the desired 128-bit vector half.
7257    unsigned NumElems = VT.getVectorNumElements();
7258    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
7259    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
7260
7261    // Insert the element into the desired half.
7262    bool Upper = IdxVal >= NumElems/2;
7263    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
7264                 DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
7265
7266    // Insert the changed part back to the 256-bit vector
7267    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
7268  }
7269
7270  if (Subtarget->hasSSE41())
7271    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
7272
7273  if (EltVT == MVT::i8)
7274    return SDValue();
7275
7276  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
7277    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
7278    // as its second argument.
7279    if (N1.getValueType() != MVT::i32)
7280      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7281    if (N2.getValueType() != MVT::i32)
7282      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7283    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
7284  }
7285  return SDValue();
7286}
7287
7288static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
7289  LLVMContext *Context = DAG.getContext();
7290  DebugLoc dl = Op.getDebugLoc();
7291  EVT OpVT = Op.getValueType();
7292
7293  // If this is a 256-bit vector result, first insert into a 128-bit
7294  // vector and then insert into the 256-bit vector.
7295  if (!OpVT.is128BitVector()) {
7296    // Insert into a 128-bit vector.
7297    EVT VT128 = EVT::getVectorVT(*Context,
7298                                 OpVT.getVectorElementType(),
7299                                 OpVT.getVectorNumElements() / 2);
7300
7301    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
7302
7303    // Insert the 128-bit vector.
7304    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
7305  }
7306
7307  if (OpVT == MVT::v1i64 &&
7308      Op.getOperand(0).getValueType() == MVT::i64)
7309    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
7310
7311  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
7312  assert(OpVT.is128BitVector() && "Expected an SSE type!");
7313  return DAG.getNode(ISD::BITCAST, dl, OpVT,
7314                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
7315}
7316
7317// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
7318// a simple subregister reference or explicit instructions to grab
7319// upper bits of a vector.
7320static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7321                                      SelectionDAG &DAG) {
7322  if (Subtarget->hasFp256()) {
7323    DebugLoc dl = Op.getNode()->getDebugLoc();
7324    SDValue Vec = Op.getNode()->getOperand(0);
7325    SDValue Idx = Op.getNode()->getOperand(1);
7326
7327    if (Op.getNode()->getValueType(0).is128BitVector() &&
7328        Vec.getNode()->getValueType(0).is256BitVector() &&
7329        isa<ConstantSDNode>(Idx)) {
7330      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7331      return Extract128BitVector(Vec, IdxVal, DAG, dl);
7332    }
7333  }
7334  return SDValue();
7335}
7336
7337// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
7338// simple superregister reference or explicit instructions to insert
7339// the upper bits of a vector.
7340static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
7341                                     SelectionDAG &DAG) {
7342  if (Subtarget->hasFp256()) {
7343    DebugLoc dl = Op.getNode()->getDebugLoc();
7344    SDValue Vec = Op.getNode()->getOperand(0);
7345    SDValue SubVec = Op.getNode()->getOperand(1);
7346    SDValue Idx = Op.getNode()->getOperand(2);
7347
7348    if (Op.getNode()->getValueType(0).is256BitVector() &&
7349        SubVec.getNode()->getValueType(0).is128BitVector() &&
7350        isa<ConstantSDNode>(Idx)) {
7351      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7352      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
7353    }
7354  }
7355  return SDValue();
7356}
7357
7358// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
7359 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
7360 // one of the above-mentioned nodes. It has to be wrapped because otherwise
7361 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
7362 // be used to form an addressing mode. These wrapped nodes will be selected
7363// into MOV32ri.
7364SDValue
7365X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
7366  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
7367
7368  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7369  // global base reg.
7370  unsigned char OpFlag = 0;
7371  unsigned WrapperKind = X86ISD::Wrapper;
7372  CodeModel::Model M = getTargetMachine().getCodeModel();
7373
7374  if (Subtarget->isPICStyleRIPRel() &&
7375      (M == CodeModel::Small || M == CodeModel::Kernel))
7376    WrapperKind = X86ISD::WrapperRIP;
7377  else if (Subtarget->isPICStyleGOT())
7378    OpFlag = X86II::MO_GOTOFF;
7379  else if (Subtarget->isPICStyleStubPIC())
7380    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7381
7382  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
7383                                             CP->getAlignment(),
7384                                             CP->getOffset(), OpFlag);
7385  DebugLoc DL = CP->getDebugLoc();
7386  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7387  // With PIC, the address is actually $g + Offset.
7388  if (OpFlag) {
7389    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7390                         DAG.getNode(X86ISD::GlobalBaseReg,
7391                                     DebugLoc(), getPointerTy()),
7392                         Result);
7393  }
7394
7395  return Result;
7396}
7397
7398SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
7399  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
7400
7401  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7402  // global base reg.
7403  unsigned char OpFlag = 0;
7404  unsigned WrapperKind = X86ISD::Wrapper;
7405  CodeModel::Model M = getTargetMachine().getCodeModel();
7406
7407  if (Subtarget->isPICStyleRIPRel() &&
7408      (M == CodeModel::Small || M == CodeModel::Kernel))
7409    WrapperKind = X86ISD::WrapperRIP;
7410  else if (Subtarget->isPICStyleGOT())
7411    OpFlag = X86II::MO_GOTOFF;
7412  else if (Subtarget->isPICStyleStubPIC())
7413    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7414
7415  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
7416                                          OpFlag);
7417  DebugLoc DL = JT->getDebugLoc();
7418  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7419
7420  // With PIC, the address is actually $g + Offset.
7421  if (OpFlag)
7422    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7423                         DAG.getNode(X86ISD::GlobalBaseReg,
7424                                     DebugLoc(), getPointerTy()),
7425                         Result);
7426
7427  return Result;
7428}
7429
7430SDValue
7431X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
7432  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
7433
7434  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7435  // global base reg.
7436  unsigned char OpFlag = 0;
7437  unsigned WrapperKind = X86ISD::Wrapper;
7438  CodeModel::Model M = getTargetMachine().getCodeModel();
7439
7440  if (Subtarget->isPICStyleRIPRel() &&
7441      (M == CodeModel::Small || M == CodeModel::Kernel)) {
7442    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
7443      OpFlag = X86II::MO_GOTPCREL;
7444    WrapperKind = X86ISD::WrapperRIP;
7445  } else if (Subtarget->isPICStyleGOT()) {
7446    OpFlag = X86II::MO_GOT;
7447  } else if (Subtarget->isPICStyleStubPIC()) {
7448    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
7449  } else if (Subtarget->isPICStyleStubNoDynamic()) {
7450    OpFlag = X86II::MO_DARWIN_NONLAZY;
7451  }
7452
7453  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
7454
7455  DebugLoc DL = Op.getDebugLoc();
7456  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7457
7458  // With PIC, the address is actually $g + Offset.
7459  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
7460      !Subtarget->is64Bit()) {
7461    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7462                         DAG.getNode(X86ISD::GlobalBaseReg,
7463                                     DebugLoc(), getPointerTy()),
7464                         Result);
7465  }
7466
7467  // For symbols that require a load from a stub to get the address, emit the
7468  // load.
7469  if (isGlobalStubReference(OpFlag))
7470    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
7471                         MachinePointerInfo::getGOT(), false, false, false, 0);
7472
7473  return Result;
7474}
7475
7476SDValue
7477X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
7478  // Create the TargetBlockAddressAddress node.
7479  unsigned char OpFlags =
7480    Subtarget->ClassifyBlockAddressReference();
7481  CodeModel::Model M = getTargetMachine().getCodeModel();
7482  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
7483  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
7484  DebugLoc dl = Op.getDebugLoc();
7485  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
7486                                             OpFlags);
7487
7488  if (Subtarget->isPICStyleRIPRel() &&
7489      (M == CodeModel::Small || M == CodeModel::Kernel))
7490    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7491  else
7492    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7493
7494  // With PIC, the address is actually $g + Offset.
7495  if (isGlobalRelativeToPICBase(OpFlags)) {
7496    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7497                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7498                         Result);
7499  }
7500
7501  return Result;
7502}
7503
7504SDValue
7505X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
7506                                      int64_t Offset,
7507                                      SelectionDAG &DAG) const {
7508  // Create the TargetGlobalAddress node, folding in the constant
7509  // offset if it is legal.
7510  unsigned char OpFlags =
7511    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
7512  CodeModel::Model M = getTargetMachine().getCodeModel();
7513  SDValue Result;
7514  if (OpFlags == X86II::MO_NO_FLAG &&
7515      X86::isOffsetSuitableForCodeModel(Offset, M)) {
7516    // A direct static reference to a global.
7517    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
7518    Offset = 0;
7519  } else {
7520    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
7521  }
7522
7523  if (Subtarget->isPICStyleRIPRel() &&
7524      (M == CodeModel::Small || M == CodeModel::Kernel))
7525    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7526  else
7527    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7528
7529  // With PIC, the address is actually $g + Offset.
7530  if (isGlobalRelativeToPICBase(OpFlags)) {
7531    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7532                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7533                         Result);
7534  }
7535
7536  // For globals that require a load from a stub to get the address, emit the
7537  // load.
7538  if (isGlobalStubReference(OpFlags))
7539    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
7540                         MachinePointerInfo::getGOT(), false, false, false, 0);
7541
7542  // If there was a non-zero offset that we didn't fold, create an explicit
7543  // addition for it.
7544  if (Offset != 0)
7545    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
7546                         DAG.getConstant(Offset, getPointerTy()));
7547
7548  return Result;
7549}
7550
7551SDValue
7552X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
7553  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
7554  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
7555  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
7556}
7557
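// GetTLSADDR - Build an X86ISD::TLSADDR (or TLSBASEADDR for local-dynamic TLS)
// node, which is codegen'ed as a call, and copy the resulting value out of
// ReturnReg.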
7558static SDValue
7559GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
7560           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
7561           unsigned char OperandFlags, bool LocalDynamic = false) {
7562  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7563  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7564  DebugLoc dl = GA->getDebugLoc();
7565  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7566                                           GA->getValueType(0),
7567                                           GA->getOffset(),
7568                                           OperandFlags);
7569
7570  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
7571                                           : X86ISD::TLSADDR;
7572
7573  if (InFlag) {
7574    SDValue Ops[] = { Chain,  TGA, *InFlag };
7575    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3);
7576  } else {
7577    SDValue Ops[]  = { Chain, TGA };
7578    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2);
7579  }
7580
7581  // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
7582  MFI->setAdjustsStack(true);
7583
7584  SDValue Flag = Chain.getValue(1);
7585  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
7586}
7587
7588// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
7589static SDValue
7590LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7591                                const EVT PtrVT) {
7592  SDValue InFlag;
7593  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
7594  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
7595                                   DAG.getNode(X86ISD::GlobalBaseReg,
7596                                               DebugLoc(), PtrVT), InFlag);
7597  InFlag = Chain.getValue(1);
7598
7599  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
7600}
7601
7602// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
7603static SDValue
7604LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7605                                const EVT PtrVT) {
7606  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
7607                    X86::RAX, X86II::MO_TLSGD);
7608}
7609
7610static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
7611                                           SelectionDAG &DAG,
7612                                           const EVT PtrVT,
7613                                           bool is64Bit) {
7614  DebugLoc dl = GA->getDebugLoc();
7615
7616  // Get the start address of the TLS block for this module.
7617  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
7618      .getInfo<X86MachineFunctionInfo>();
7619  MFI->incNumLocalDynamicTLSAccesses();
7620
7621  SDValue Base;
7622  if (is64Bit) {
7623    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
7624                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
7625  } else {
7626    SDValue InFlag;
7627    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
7628        DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
7629    InFlag = Chain.getValue(1);
7630    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
7631                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
7632  }
7633
7634  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
7635  // of Base.
7636
7637  // Build x@dtpoff.
7638  unsigned char OperandFlags = X86II::MO_DTPOFF;
7639  unsigned WrapperKind = X86ISD::Wrapper;
7640  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7641                                           GA->getValueType(0),
7642                                           GA->getOffset(), OperandFlags);
7643  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
7644
7645  // Add x@dtpoff with the base.
7646  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
7647}
7648
7649// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
7650static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7651                                   const EVT PtrVT, TLSModel::Model model,
7652                                   bool is64Bit, bool isPIC) {
7653  DebugLoc dl = GA->getDebugLoc();
7654
7655  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
7656  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
7657                                                         is64Bit ? 257 : 256));
7658
7659  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
7660                                      DAG.getIntPtrConstant(0),
7661                                      MachinePointerInfo(Ptr),
7662                                      false, false, false, 0);
7663
7664  unsigned char OperandFlags = 0;
7665  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
7666  // initial exec.
7667  unsigned WrapperKind = X86ISD::Wrapper;
7668  if (model == TLSModel::LocalExec) {
7669    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
7670  } else if (model == TLSModel::InitialExec) {
7671    if (is64Bit) {
7672      OperandFlags = X86II::MO_GOTTPOFF;
7673      WrapperKind = X86ISD::WrapperRIP;
7674    } else {
7675      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
7676    }
7677  } else {
7678    llvm_unreachable("Unexpected model");
7679  }
7680
7681  // emit "addl x@ntpoff,%eax" (local exec)
7682  // or "addl x@indntpoff,%eax" (initial exec)
7683  // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
7684  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7685                                           GA->getValueType(0),
7686                                           GA->getOffset(), OperandFlags);
7687  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
7688
7689  if (model == TLSModel::InitialExec) {
7690    if (isPIC && !is64Bit) {
7691      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
7692                          DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
7693                           Offset);
7694    }
7695
7696    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
7697                         MachinePointerInfo::getGOT(), false, false, false,
7698                         0);
7699  }
7700
7701  // The address of the thread local variable is the add of the thread
7702  // pointer with the offset of the variable.
7703  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
7704}
7705
7706SDValue
7707X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
7708
7709  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
7710  const GlobalValue *GV = GA->getGlobal();
7711
7712  if (Subtarget->isTargetELF()) {
7713    TLSModel::Model model = getTargetMachine().getTLSModel(GV);
7714
7715    switch (model) {
7716      case TLSModel::GeneralDynamic:
7717        if (Subtarget->is64Bit())
7718          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
7719        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
7720      case TLSModel::LocalDynamic:
7721        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
7722                                           Subtarget->is64Bit());
7723      case TLSModel::InitialExec:
7724      case TLSModel::LocalExec:
7725        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
7726                                   Subtarget->is64Bit(),
7727                         getTargetMachine().getRelocationModel() == Reloc::PIC_);
7728    }
7729    llvm_unreachable("Unknown TLS model.");
7730  }
7731
7732  if (Subtarget->isTargetDarwin()) {
7733    // Darwin only has one model of TLS.  Lower to that.
7734    unsigned char OpFlag = 0;
7735    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
7736                           X86ISD::WrapperRIP : X86ISD::Wrapper;
7737
7738    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7739    // global base reg.
7740    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
7741                  !Subtarget->is64Bit();
7742    if (PIC32)
7743      OpFlag = X86II::MO_TLVP_PIC_BASE;
7744    else
7745      OpFlag = X86II::MO_TLVP;
7746    DebugLoc DL = Op.getDebugLoc();
7747    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
7748                                                GA->getValueType(0),
7749                                                GA->getOffset(), OpFlag);
7750    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7751
7752    // With PIC32, the address is actually $g + Offset.
7753    if (PIC32)
7754      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7755                           DAG.getNode(X86ISD::GlobalBaseReg,
7756                                       DebugLoc(), getPointerTy()),
7757                           Offset);
7758
7759    // Lowering the machine isd will make sure everything is in the right
7760    // location.
7761    SDValue Chain = DAG.getEntryNode();
7762    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7763    SDValue Args[] = { Chain, Offset };
7764    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
7765
7766    // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
7767    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7768    MFI->setAdjustsStack(true);
7769
7770    // And our return value (tls address) is in the standard call return value
7771    // location.
7772    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
7773    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
7774                              Chain.getValue(1));
7775  }
7776
7777  if (Subtarget->isTargetWindows()) {
7778    // Just use the implicit TLS architecture.
7779    // Need to generate something similar to:
7780    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
7781    //                                  ; from TEB
7782    //   mov     ecx, dword [rel _tls_index] ; Load index (from C runtime)
7783    //   mov     rcx, qword [rdx+rcx*8]
7784    //   mov     eax, .tls$:tlsvar
7785    //   [rax+rcx] contains the address
7786    // Windows 64bit: gs:0x58
7787    // Windows 32bit: fs:__tls_array
7788
7789    // If GV is an alias then use the aliasee for determining
7790    // thread-localness.
7791    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
7792      GV = GA->resolveAliasedGlobal(false);
7793    DebugLoc dl = GA->getDebugLoc();
7794    SDValue Chain = DAG.getEntryNode();
7795
7796    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
7797    // %gs:0x58 (64-bit).
7798    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
7799                                        ? Type::getInt8PtrTy(*DAG.getContext(),
7800                                                             256)
7801                                        : Type::getInt32PtrTy(*DAG.getContext(),
7802                                                              257));
7803
7804    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain,
7805                                        Subtarget->is64Bit()
7806                                        ? DAG.getIntPtrConstant(0x58)
7807                                        : DAG.getExternalSymbol("_tls_array",
7808                                                                getPointerTy()),
7809                                        MachinePointerInfo(Ptr),
7810                                        false, false, false, 0);
7811
7812    // Load the _tls_index variable
7813    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
7814    if (Subtarget->is64Bit())
7815      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
7816                           IDX, MachinePointerInfo(), MVT::i32,
7817                           false, false, 0);
7818    else
7819      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
7820                        false, false, false, 0);
7821
7822    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
7823                                    getPointerTy());
7824    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
7825
7826    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
7827    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
7828                      false, false, false, 0);
7829
7830    // Get the offset of the start of the .tls section
7831    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7832                                             GA->getValueType(0),
7833                                             GA->getOffset(), X86II::MO_SECREL);
7834    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
7835
7836    // The address of the thread local variable is the add of the thread
7837    // pointer with the offset of the variable.
7838    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
7839  }
7840
7841  llvm_unreachable("TLS not implemented for this target.");
7842}
7843
7844/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
7845/// and take a 2 x i32 value to shift plus a shift amount.
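/// The result for shift amounts below VTBits is built from SHLD/SHRD plus an
/// ordinary shift; CMOVs keyed on the VTBits bit of the shift amount then
/// select the corrected values when the amount is VTBits or larger.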
7846SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
7847  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
7848  EVT VT = Op.getValueType();
7849  unsigned VTBits = VT.getSizeInBits();
7850  DebugLoc dl = Op.getDebugLoc();
7851  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
7852  SDValue ShOpLo = Op.getOperand(0);
7853  SDValue ShOpHi = Op.getOperand(1);
7854  SDValue ShAmt  = Op.getOperand(2);
7855  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
7856                                     DAG.getConstant(VTBits - 1, MVT::i8))
7857                       : DAG.getConstant(0, VT);
7858
7859  SDValue Tmp2, Tmp3;
7860  if (Op.getOpcode() == ISD::SHL_PARTS) {
7861    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
7862    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
7863  } else {
7864    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
7865    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
7866  }
7867
7868  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
7869                                DAG.getConstant(VTBits, MVT::i8));
7870  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
7871                             AndNode, DAG.getConstant(0, MVT::i8));
7872
7873  SDValue Hi, Lo;
7874  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
7875  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
7876  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
7877
7878  if (Op.getOpcode() == ISD::SHL_PARTS) {
7879    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7880    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7881  } else {
7882    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7883    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7884  }
7885
7886  SDValue Ops[2] = { Lo, Hi };
7887  return DAG.getMergeValues(Ops, 2, dl);
7888}
7889
7890SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
7891                                           SelectionDAG &DAG) const {
7892  EVT SrcVT = Op.getOperand(0).getValueType();
7893
7894  if (SrcVT.isVector())
7895    return SDValue();
7896
7897  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
7898         "Unknown SINT_TO_FP to lower!");
7899
7900  // These are really Legal; return the operand so the caller accepts it as
7901  // Legal.
7902  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
7903    return Op;
7904  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
7905      Subtarget->is64Bit()) {
7906    return Op;
7907  }
7908
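  // Otherwise, spill the integer to a stack slot and convert it with BuildFILD.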
7909  DebugLoc dl = Op.getDebugLoc();
7910  unsigned Size = SrcVT.getSizeInBits()/8;
7911  MachineFunction &MF = DAG.getMachineFunction();
7912  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
7913  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7914  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7915                               StackSlot,
7916                               MachinePointerInfo::getFixedStack(SSFI),
7917                               false, false, 0);
7918  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
7919}
7920
7921SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
7922                                     SDValue StackSlot,
7923                                     SelectionDAG &DAG) const {
7924  // Build the FILD
7925  DebugLoc DL = Op.getDebugLoc();
7926  SDVTList Tys;
7927  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
7928  if (useSSE)
7929    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
7930  else
7931    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
7932
7933  unsigned ByteSize = SrcVT.getSizeInBits()/8;
7934
7935  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
7936  MachineMemOperand *MMO;
7937  if (FI) {
7938    int SSFI = FI->getIndex();
7939    MMO =
7940      DAG.getMachineFunction()
7941      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7942                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
7943  } else {
7944    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
7945    StackSlot = StackSlot.getOperand(1);
7946  }
7947  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
7948  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
7949                                           X86ISD::FILD, DL,
7950                                           Tys, Ops, array_lengthof(Ops),
7951                                           SrcVT, MMO);
7952
7953  if (useSSE) {
7954    Chain = Result.getValue(1);
7955    SDValue InFlag = Result.getValue(2);
7956
7957    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
7958    // shouldn't be necessary except that RFP cannot be live across
7959    // multiple blocks. When the stackifier is fixed, they can be uncoupled.
7960    MachineFunction &MF = DAG.getMachineFunction();
7961    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
7962    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
7963    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7964    Tys = DAG.getVTList(MVT::Other);
7965    SDValue Ops[] = {
7966      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
7967    };
7968    MachineMemOperand *MMO =
7969      DAG.getMachineFunction()
7970      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7971                            MachineMemOperand::MOStore, SSFISize, SSFISize);
7972
7973    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
7974                                    Ops, array_lengthof(Ops),
7975                                    Op.getValueType(), MMO);
7976    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
7977                         MachinePointerInfo::getFixedStack(SSFI),
7978                         false, false, false, 0);
7979  }
7980
7981  return Result;
7982}
7983
7984// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
7985SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
7986                                               SelectionDAG &DAG) const {
7987  // This algorithm is not obvious. Here is what we're trying to output:
7988  /*
7989     movq       %rax,  %xmm0
7990     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
7991     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
7992     #ifdef __SSE3__
7993       haddpd   %xmm0, %xmm0
7994     #else
7995       pshufd   $0x4e, %xmm0, %xmm1
7996       addpd    %xmm1, %xmm0
7997     #endif
7998  */
7999
8000  DebugLoc dl = Op.getDebugLoc();
8001  LLVMContext *Context = DAG.getContext();
8002
8003  // Build some magic constants.
8004  const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
8005  Constant *C0 = ConstantDataVector::get(*Context, CV0);
8006  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
8007
8008  SmallVector<Constant*,2> CV1;
8009  CV1.push_back(
8010        ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
8011  CV1.push_back(
8012        ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
8013  Constant *C1 = ConstantVector::get(CV1);
8014  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
8015
8016  // Load the 64-bit value into an XMM register.
8017  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
8018                            Op.getOperand(0));
8019  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
8020                              MachinePointerInfo::getConstantPool(),
8021                              false, false, false, 16);
8022  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
8023                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
8024                              CLod0);
8025
8026  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
8027                              MachinePointerInfo::getConstantPool(),
8028                              false, false, false, 16);
8029  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
8030  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
8031  SDValue Result;
8032
8033  if (Subtarget->hasSSE3()) {
8034    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
8035    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
8036  } else {
8037    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
8038    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
8039                                           S2F, 0x4E, DAG);
8040    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
8041                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
8042                         Sub);
8043  }
8044
8045  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
8046                     DAG.getIntPtrConstant(0));
8047}
8048
8049// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
8050SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
8051                                               SelectionDAG &DAG) const {
8052  DebugLoc dl = Op.getDebugLoc();
8053  // FP constant to bias correct the final result.
8054  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
8055                                   MVT::f64);
8056
8057  // Load the 32-bit value into an XMM register.
8058  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
8059                             Op.getOperand(0));
8060
8061  // Zero out the upper parts of the register.
8062  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
8063
8064  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8065                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
8066                     DAG.getIntPtrConstant(0));
8067
8068  // Or the load with the bias.
8069  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
8070                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
8071                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8072                                                   MVT::v2f64, Load)),
8073                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
8074                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8075                                                   MVT::v2f64, Bias)));
8076  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8077                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
8078                   DAG.getIntPtrConstant(0));
8079
8080  // Subtract the bias.
8081  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
8082
8083  // Handle final rounding.
8084  EVT DestVT = Op.getValueType();
8085
8086  if (DestVT.bitsLT(MVT::f64))
8087    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
8088                       DAG.getIntPtrConstant(0));
8089  if (DestVT.bitsGT(MVT::f64))
8090    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
8091
8092  // The result is already f64; no rounding is needed.
8093  return Sub;
8094}
8095
8096SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
8097                                               SelectionDAG &DAG) const {
8098  SDValue N0 = Op.getOperand(0);
8099  EVT SVT = N0.getValueType();
8100  DebugLoc dl = Op.getDebugLoc();
8101
8102  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
8103          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
8104         "Custom UINT_TO_FP is not supported!");
8105
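      // After zero-extending the narrow elements to i32 the values are always
      // non-negative, so SINT_TO_FP computes the same result as UINT_TO_FP.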
8106  EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, SVT.getVectorNumElements());
8107  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
8108                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
8109}
8110
8111SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
8112                                           SelectionDAG &DAG) const {
8113  SDValue N0 = Op.getOperand(0);
8114  DebugLoc dl = Op.getDebugLoc();
8115
8116  if (Op.getValueType().isVector())
8117    return lowerUINT_TO_FP_vec(Op, DAG);
8118
8119  // Since UINT_TO_FP is marked Custom (and therefore treated as legal), the
8120  // DAG combiner won't optimize it to a SINT_TO_FP when the sign bit is known
8121  // zero. Perform the optimization here.
8122  if (DAG.SignBitIsZero(N0))
8123    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
8124
8125  EVT SrcVT = N0.getValueType();
8126  EVT DstVT = Op.getValueType();
8127  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
8128    return LowerUINT_TO_FP_i64(Op, DAG);
8129  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
8130    return LowerUINT_TO_FP_i32(Op, DAG);
8131  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
8132    return SDValue();
8133
8134  // Make a 64-bit buffer, and use it to build an FILD.
8135  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
8136  if (SrcVT == MVT::i32) {
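        // Zero-extend the i32 in memory: store the value in the low 32 bits and
        // zero in the high 32 bits, then FILD the whole 64-bit slot.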
8137    SDValue WordOff = DAG.getConstant(4, getPointerTy());
8138    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
8139                                     getPointerTy(), StackSlot, WordOff);
8140    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8141                                  StackSlot, MachinePointerInfo(),
8142                                  false, false, 0);
8143    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
8144                                  OffsetSlot, MachinePointerInfo(),
8145                                  false, false, 0);
8146    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
8147    return Fild;
8148  }
8149
8150  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
8151  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8152                               StackSlot, MachinePointerInfo(),
8153                               false, false, 0);
8154  // For i64 source, we need to add the appropriate power of 2 if the input
8155  // was negative.  This is the same as the optimization in
8156  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
8157  // we must be careful to do the computation in x87 extended precision, not
8158  // in SSE. (The generic code can't know it's OK to do this, or how to.)
8159  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
8160  MachineMemOperand *MMO =
8161    DAG.getMachineFunction()
8162    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8163                          MachineMemOperand::MOLoad, 8, 8);
8164
8165  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
8166  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
8167  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
8168                                         MVT::i64, MMO);
8169
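      // 0x5F800000 is the IEEE single-precision bit pattern for 2^64, the value
      // we must add back when the i64 input was interpreted as negative.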
8170  APInt FF(32, 0x5F800000ULL);
8171
8172  // Check whether the sign bit is set.
8173  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
8174                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
8175                                 ISD::SETLT);
8176
8177  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
8178  SDValue FudgePtr = DAG.getConstantPool(
8179                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
8180                                         getPointerTy());
8181
8182  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
8183  SDValue Zero = DAG.getIntPtrConstant(0);
8184  SDValue Four = DAG.getIntPtrConstant(4);
8185  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
8186                               Zero, Four);
8187  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
8188
8189  // Load the value out, extending it from f32 to f80.
8190  // FIXME: Avoid the extend by constructing the right constant pool?
8191  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
8192                                 FudgePtr, MachinePointerInfo::getConstantPool(),
8193                                 MVT::f32, false, false, 4);
8194  // Extend everything to 80 bits to force it to be done on x87.
8195  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
8196  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
8197}
8198
8199std::pair<SDValue,SDValue> X86TargetLowering::
8200FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const {
8201  DebugLoc DL = Op.getDebugLoc();
8202
8203  EVT DstTy = Op.getValueType();
8204
8205  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
8206    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
8207    DstTy = MVT::i64;
8208  }
8209
8210  assert(DstTy.getSimpleVT() <= MVT::i64 &&
8211         DstTy.getSimpleVT() >= MVT::i16 &&
8212         "Unknown FP_TO_INT to lower!");
8213
8214  // These are really Legal.
8215  if (DstTy == MVT::i32 &&
8216      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8217    return std::make_pair(SDValue(), SDValue());
8218  if (Subtarget->is64Bit() &&
8219      DstTy == MVT::i64 &&
8220      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8221    return std::make_pair(SDValue(), SDValue());
8222
8223  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
8224  // stack slot, or into the FTOL runtime function.
8225  MachineFunction &MF = DAG.getMachineFunction();
8226  unsigned MemSize = DstTy.getSizeInBits()/8;
8227  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8228  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8229
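      // Pick the FP_TO_INT*_IN_MEM opcode for the destination width, or fall
      // back to the WIN_FTOL runtime call for unsigned results where available.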
8230  unsigned Opc;
8231  if (!IsSigned && isIntegerTypeFTOL(DstTy))
8232    Opc = X86ISD::WIN_FTOL;
8233  else
8234    switch (DstTy.getSimpleVT().SimpleTy) {
8235    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
8236    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
8237    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
8238    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
8239    }
8240
8241  SDValue Chain = DAG.getEntryNode();
8242  SDValue Value = Op.getOperand(0);
8243  EVT TheVT = Op.getOperand(0).getValueType();
8244  // FIXME This causes a redundant load/store if the SSE-class value is already
8245  // in memory, such as if it is on the callstack.
8246  if (isScalarFPTypeInSSEReg(TheVT)) {
8247    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
8248    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
8249                         MachinePointerInfo::getFixedStack(SSFI),
8250                         false, false, 0);
8251    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
8252    SDValue Ops[] = {
8253      Chain, StackSlot, DAG.getValueType(TheVT)
8254    };
8255
8256    MachineMemOperand *MMO =
8257      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8258                              MachineMemOperand::MOLoad, MemSize, MemSize);
8259    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3,
8260                                    DstTy, MMO);
8261    Chain = Value.getValue(1);
8262    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8263    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8264  }
8265
8266  MachineMemOperand *MMO =
8267    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8268                            MachineMemOperand::MOStore, MemSize, MemSize);
8269
8270  if (Opc != X86ISD::WIN_FTOL) {
8271    // Build the FP_TO_INT*_IN_MEM
8272    SDValue Ops[] = { Chain, Value, StackSlot };
8273    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
8274                                           Ops, 3, DstTy, MMO);
8275    return std::make_pair(FIST, StackSlot);
8276  } else {
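        // The FTOL runtime call returns the 64-bit result in EDX:EAX; copy both
        // halves out, glued together so they stay adjacent.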
8277    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
8278      DAG.getVTList(MVT::Other, MVT::Glue),
8279      Chain, Value);
8280    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
8281      MVT::i32, ftol.getValue(1));
8282    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
8283      MVT::i32, eax.getValue(2));
8284    SDValue Ops[] = { eax, edx };
8285    SDValue pair = IsReplace
8286      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2)
8287      : DAG.getMergeValues(Ops, 2, DL);
8288    return std::make_pair(pair, SDValue());
8289  }
8290}
8291
8292SDValue X86TargetLowering::lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const {
8293  DebugLoc DL = Op.getDebugLoc();
8294  EVT VT = Op.getValueType();
8295  SDValue In = Op.getOperand(0);
8296  EVT SVT = In.getValueType();
8297
8298  if (!VT.is256BitVector() || !SVT.is128BitVector() ||
8299      VT.getVectorNumElements() != SVT.getVectorNumElements())
8300    return SDValue();
8301
8302  assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
8303
8304  // AVX2 has better support of integer extending.
8305  if (Subtarget->hasInt256())
8306    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
8307
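      // Without AVX2, split the v8i16 input: zero-extend the low half directly,
      // shuffle the high half down into the low lanes and extend it, then
      // concatenate the two v4i32 halves into a v8i32.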
8308  SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
8309  static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
8310  SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
8311                           DAG.getVectorShuffle(MVT::v8i16, DL, In, DAG.getUNDEF(MVT::v8i16), &Mask[0]));
8312
8313  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
8314}
8315
8316SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
8317  DebugLoc DL = Op.getDebugLoc();
8318  EVT VT = Op.getValueType();
8319  SDValue In = Op.getOperand(0);
8320  EVT SVT = In.getValueType();
8321
8322  if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) {
8323    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
8324    if (Subtarget->hasInt256()) {
8325      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
8326      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
8327      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
8328                                ShufMask);
8329      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
8330                         DAG.getIntPtrConstant(0));
8331    }
8332
8333    // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
8334    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
8335                               DAG.getIntPtrConstant(0));
8336    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
8337                               DAG.getIntPtrConstant(2));
8338
8339    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
8340    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
8341
8342    // The PSHUFD mask:
8343    static const int ShufMask1[] = {0, 2, 0, 0};
8344    SDValue Undef = DAG.getUNDEF(VT);
8345    OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
8346    OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);
8347
8348    // The MOVLHPS mask:
8349    static const int ShufMask2[] = {0, 1, 4, 5};
8350    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
8351  }
8352
8353  if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) {
8354    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
8355    if (Subtarget->hasInt256()) {
8356      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
8357
8358      SmallVector<SDValue,32> pshufbMask;
8359      for (unsigned i = 0; i < 2; ++i) {
8360        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
8361        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
8362        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
8363        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
8364        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
8365        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
8366        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
8367        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
8368        for (unsigned j = 0; j < 8; ++j)
8369          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
8370      }
8371      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
8372                               &pshufbMask[0], 32);
8373      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
8374      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
8375
8376      static const int ShufMask[] = {0,  2,  -1,  -1};
8377      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
8378                                &ShufMask[0]);
8379      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
8380                       DAG.getIntPtrConstant(0));
8381      return DAG.getNode(ISD::BITCAST, DL, VT, In);
8382    }
8383
8384    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
8385                               DAG.getIntPtrConstant(0));
8386
8387    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
8388                               DAG.getIntPtrConstant(4));
8389
8390    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
8391    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
8392
8393    // The PSHUFB mask:
8394    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
8395                                   -1, -1, -1, -1, -1, -1, -1, -1};
8396
8397    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
8398    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
8399    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
8400
8401    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
8402    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
8403
8404    // The MOVLHPS Mask:
8405    static const int ShufMask2[] = {0, 1, 4, 5};
8406    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
8407    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
8408  }
8409
8410  // Handle truncation of V256 to V128 using shuffles.
8411  if (!VT.is128BitVector() || !SVT.is256BitVector())
8412    return SDValue();
8413
8414  assert(VT.getVectorNumElements() != SVT.getVectorNumElements() &&
8415         "Invalid op");
8416  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
8417
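      // Reinterpret the wide source as a vector of 2*NumElems narrow elements,
      // shuffle the even elements to the front, and take the low 128-bit half.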
8418  unsigned NumElems = VT.getVectorNumElements();
8419  EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
8420                             NumElems * 2);
8421
8422  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
8423  // Prepare truncation shuffle mask
8424  for (unsigned i = 0; i != NumElems; ++i)
8425    MaskVec[i] = i * 2;
8426  SDValue V = DAG.getVectorShuffle(NVT, DL,
8427                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
8428                                   DAG.getUNDEF(NVT), &MaskVec[0]);
8429  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
8430                     DAG.getIntPtrConstant(0));
8431}
8432
8433SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
8434                                           SelectionDAG &DAG) const {
8435  if (Op.getValueType().isVector()) {
8436    if (Op.getValueType() == MVT::v8i16)
8437      return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(),
8438                         DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
8439                                     MVT::v8i32, Op.getOperand(0)));
8440    return SDValue();
8441  }
8442
8443  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
8444    /*IsSigned=*/ true, /*IsReplace=*/ false);
8445  SDValue FIST = Vals.first, StackSlot = Vals.second;
8446  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
8447  if (FIST.getNode() == 0) return Op;
8448
8449  if (StackSlot.getNode())
8450    // Load the result.
8451    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
8452                       FIST, StackSlot, MachinePointerInfo(),
8453                       false, false, false, 0);
8454
8455  // The node is the result.
8456  return FIST;
8457}
8458
8459SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
8460                                           SelectionDAG &DAG) const {
8461  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
8462    /*IsSigned=*/ false, /*IsReplace=*/ false);
8463  SDValue FIST = Vals.first, StackSlot = Vals.second;
8464  assert(FIST.getNode() && "Unexpected failure");
8465
8466  if (StackSlot.getNode())
8467    // Load the result.
8468    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
8469                       FIST, StackSlot, MachinePointerInfo(),
8470                       false, false, false, 0);
8471
8472  // The node is the result.
8473  return FIST;
8474}
8475
8476SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op,
8477                                          SelectionDAG &DAG) const {
8478  DebugLoc DL = Op.getDebugLoc();
8479  EVT VT = Op.getValueType();
8480  SDValue In = Op.getOperand(0);
8481  EVT SVT = In.getValueType();
8482
8483  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
8484
8485  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
8486                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
8487                                 In, DAG.getUNDEF(SVT)));
8488}
8489
8490SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
8491  LLVMContext *Context = DAG.getContext();
8492  DebugLoc dl = Op.getDebugLoc();
8493  EVT VT = Op.getValueType();
8494  EVT EltVT = VT;
8495  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
8496  if (VT.isVector()) {
8497    EltVT = VT.getVectorElementType();
8498    NumElts = VT.getVectorNumElements();
8499  }
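      // Build a splat constant with every bit set except the sign bit; ANDing
      // the input with it clears the sign bit, which implements fabs.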
8500  Constant *C;
8501  if (EltVT == MVT::f64)
8502    C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
8503  else
8504    C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
8505  C = ConstantVector::getSplat(NumElts, C);
8506  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
8507  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
8508  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8509                             MachinePointerInfo::getConstantPool(),
8510                             false, false, false, Alignment);
8511  if (VT.isVector()) {
8512    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
8513    return DAG.getNode(ISD::BITCAST, dl, VT,
8514                       DAG.getNode(ISD::AND, dl, ANDVT,
8515                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
8516                                               Op.getOperand(0)),
8517                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
8518  }
8519  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
8520}
8521
8522SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
8523  LLVMContext *Context = DAG.getContext();
8524  DebugLoc dl = Op.getDebugLoc();
8525  EVT VT = Op.getValueType();
8526  EVT EltVT = VT;
8527  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
8528  if (VT.isVector()) {
8529    EltVT = VT.getVectorElementType();
8530    NumElts = VT.getVectorNumElements();
8531  }
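      // Build a splat constant with only the sign bit set; XORing the input
      // with it flips the sign bit, which implements fneg.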
8532  Constant *C;
8533  if (EltVT == MVT::f64)
8534    C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
8535  else
8536    C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
8537  C = ConstantVector::getSplat(NumElts, C);
8538  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
8539  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
8540  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8541                             MachinePointerInfo::getConstantPool(),
8542                             false, false, false, Alignment);
8543  if (VT.isVector()) {
8544    MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
8545    return DAG.getNode(ISD::BITCAST, dl, VT,
8546                       DAG.getNode(ISD::XOR, dl, XORVT,
8547                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
8548                                               Op.getOperand(0)),
8549                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
8550  }
8551
8552  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
8553}
8554
8555SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8556  LLVMContext *Context = DAG.getContext();
8557  SDValue Op0 = Op.getOperand(0);
8558  SDValue Op1 = Op.getOperand(1);
8559  DebugLoc dl = Op.getDebugLoc();
8560  EVT VT = Op.getValueType();
8561  EVT SrcVT = Op1.getValueType();
8562
8563  // If second operand is smaller, extend it first.
8564  if (SrcVT.bitsLT(VT)) {
8565    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
8566    SrcVT = VT;
8567  }
8568  // And if it is bigger, shrink it first.
8569  if (SrcVT.bitsGT(VT)) {
8570    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
8571    SrcVT = VT;
8572  }
8573
8574  // At this point the operands and the result should have the same
8575  // type, and that won't be f80 since that is not custom lowered.
8576
8577  // First get the sign bit of second operand.
8578  SmallVector<Constant*,4> CV;
8579  if (SrcVT == MVT::f64) {
8580    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
8581    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
8582  } else {
8583    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
8584    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8585    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8586    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8587  }
8588  Constant *C = ConstantVector::get(CV);
8589  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8590  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
8591                              MachinePointerInfo::getConstantPool(),
8592                              false, false, false, 16);
8593  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
8594
8595  // Shift sign bit right or left if the two operands have different types.
8596  if (SrcVT.bitsGT(VT)) {
8597    // Op0 is MVT::f32, Op1 is MVT::f64.
8598    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
8599    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
8600                          DAG.getConstant(32, MVT::i32));
8601    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
8602    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
8603                          DAG.getIntPtrConstant(0));
8604  }
8605
8606  // Clear first operand sign bit.
8607  CV.clear();
8608  if (VT == MVT::f64) {
8609    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
8610    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
8611  } else {
8612    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
8613    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8614    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8615    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8616  }
8617  C = ConstantVector::get(CV);
8618  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8619  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8620                              MachinePointerInfo::getConstantPool(),
8621                              false, false, false, 16);
8622  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
8623
8624  // Or the value with the sign bit.
8625  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
8626}
8627
8628static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
8629  SDValue N0 = Op.getOperand(0);
8630  DebugLoc dl = Op.getDebugLoc();
8631  EVT VT = Op.getValueType();
8632
8633  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
8634  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
8635                                  DAG.getConstant(1, VT));
8636  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
8637}
8638
8639// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
8640//
8641SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const {
8642  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
8643
8644  if (!Subtarget->hasSSE41())
8645    return SDValue();
8646
8647  if (!Op->hasOneUse())
8648    return SDValue();
8649
8650  SDNode *N = Op.getNode();
8651  DebugLoc DL = N->getDebugLoc();
8652
8653  SmallVector<SDValue, 8> Opnds;
8654  DenseMap<SDValue, unsigned> VecInMap;
8655  EVT VT = MVT::Other;
8656
8657  // Recognize a special case where a vector has been cast into a wide integer
8658  // to test whether all of its elements are zero.
8659  Opnds.push_back(N->getOperand(0));
8660  Opnds.push_back(N->getOperand(1));
8661
8662  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
8663    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
8664    // BFS traverse all OR'd operands.
8665    if (I->getOpcode() == ISD::OR) {
8666      Opnds.push_back(I->getOperand(0));
8667      Opnds.push_back(I->getOperand(1));
8668      // Re-evaluate the number of nodes to be traversed.
8669      e += 2; // 2 more nodes (LHS and RHS) are pushed.
8670      continue;
8671    }
8672
8673    // Quit if this operand is not an EXTRACT_VECTOR_ELT.
8674    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8675      return SDValue();
8676
8677    // Quit if the extract index is not a constant.
8678    SDValue Idx = I->getOperand(1);
8679    if (!isa<ConstantSDNode>(Idx))
8680      return SDValue();
8681
8682    SDValue ExtractedFromVec = I->getOperand(0);
8683    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
8684    if (M == VecInMap.end()) {
8685      VT = ExtractedFromVec.getValueType();
8686      // Quit if not 128/256-bit vector.
8687      if (!VT.is128BitVector() && !VT.is256BitVector())
8688        return SDValue();
8689      // Quit if not the same type.
8690      if (VecInMap.begin() != VecInMap.end() &&
8691          VT != VecInMap.begin()->first.getValueType())
8692        return SDValue();
8693      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
8694    }
8695    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
8696  }
8697
8698  assert((VT.is128BitVector() || VT.is256BitVector()) &&
8699         "Not extracted from 128-/256-bit vector.");
8700
8701  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
8702  SmallVector<SDValue, 8> VecIns;
8703
8704  for (DenseMap<SDValue, unsigned>::const_iterator
8705        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
8706    // Quit if not all elements are used.
8707    if (I->second != FullMask)
8708      return SDValue();
8709    VecIns.push_back(I->first);
8710  }
8711
8712  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
8713
8714  // Cast all vectors into TestVT for PTEST.
8715  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
8716    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
8717
8718  // If more than one full vector is evaluated, OR them together before PTEST.
8719  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
8720    // Each iteration will OR 2 nodes and append the result until there is only
8721    // 1 node left, i.e. the final OR'd value of all vectors.
8722    SDValue LHS = VecIns[Slot];
8723    SDValue RHS = VecIns[Slot + 1];
8724    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
8725  }
8726
8727  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
8728                     VecIns.back(), VecIns.back());
8729}
8730
8731/// Emit nodes that will be selected as "test Op0,Op0", or something
8732/// equivalent.
8733SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
8734                                    SelectionDAG &DAG) const {
8735  DebugLoc dl = Op.getDebugLoc();
8736
8737  // CF and OF aren't always set the way we want. Determine which
8738  // of these we need.
8739  bool NeedCF = false;
8740  bool NeedOF = false;
8741  switch (X86CC) {
8742  default: break;
8743  case X86::COND_A: case X86::COND_AE:
8744  case X86::COND_B: case X86::COND_BE:
8745    NeedCF = true;
8746    break;
8747  case X86::COND_G: case X86::COND_GE:
8748  case X86::COND_L: case X86::COND_LE:
8749  case X86::COND_O: case X86::COND_NO:
8750    NeedOF = true;
8751    break;
8752  }
8753
8754  // See if we can use the EFLAGS value from the operand instead of
8755  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
8756  // we prove that the arithmetic won't overflow, we can't use OF or CF.
8757  if (Op.getResNo() != 0 || NeedOF || NeedCF)
8758    // Emit a CMP with 0, which is the TEST pattern.
8759    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8760                       DAG.getConstant(0, Op.getValueType()));
8761
8762  unsigned Opcode = 0;
8763  unsigned NumOperands = 0;
8764
8765  // Truncate operations may prevent the merge of the SETCC instruction
8766  // and the arithmetic instruction before it. Attempt to truncate the operands
8767  // of the arithmetic instruction and use a reduced bit-width instruction.
8768  bool NeedTruncation = false;
8769  SDValue ArithOp = Op;
8770  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
8771    SDValue Arith = Op->getOperand(0);
8772    // Both the trunc and the arithmetic op need to have one user each.
8773    if (Arith->hasOneUse())
8774      switch (Arith.getOpcode()) {
8775        default: break;
8776        case ISD::ADD:
8777        case ISD::SUB:
8778        case ISD::AND:
8779        case ISD::OR:
8780        case ISD::XOR: {
8781          NeedTruncation = true;
8782          ArithOp = Arith;
8783        }
8784      }
8785  }
8786
8787  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
8788  // which may sit behind a truncate.  We use the variable 'Op', the
8789  // non-truncated value, when we check for possible users.
8790  switch (ArithOp.getOpcode()) {
8791  case ISD::ADD:
8792    // Due to an isel shortcoming, be conservative if this add is likely to be
8793    // selected as part of a load-modify-store instruction. When the root node
8794    // in a match is a store, isel doesn't know how to remap non-chain non-flag
8795    // uses of other nodes in the match, such as the ADD in this case. This
8796    // leads to the ADD being left around and reselected, with the result being
8797    // two adds in the output.  Alas, even if none of our users are stores, that
8798    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
8799    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
8800    // climbing the DAG back to the root, and it doesn't seem to be worth the
8801    // effort.
8802    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8803         UE = Op.getNode()->use_end(); UI != UE; ++UI)
8804      if (UI->getOpcode() != ISD::CopyToReg &&
8805          UI->getOpcode() != ISD::SETCC &&
8806          UI->getOpcode() != ISD::STORE)
8807        goto default_case;
8808
8809    if (ConstantSDNode *C =
8810        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
8811      // An add of one will be selected as an INC.
8812      if (C->getAPIntValue() == 1) {
8813        Opcode = X86ISD::INC;
8814        NumOperands = 1;
8815        break;
8816      }
8817
8818      // An add of negative one (subtract of one) will be selected as a DEC.
8819      if (C->getAPIntValue().isAllOnesValue()) {
8820        Opcode = X86ISD::DEC;
8821        NumOperands = 1;
8822        break;
8823      }
8824    }
8825
8826    // Otherwise use a regular EFLAGS-setting add.
8827    Opcode = X86ISD::ADD;
8828    NumOperands = 2;
8829    break;
8830  case ISD::AND: {
8831    // If the primary result of the 'and' isn't used, don't bother using
8832    // X86ISD::AND, because a TEST instruction will be better.
8833    bool NonFlagUse = false;
8834    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8835           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
8836      SDNode *User = *UI;
8837      unsigned UOpNo = UI.getOperandNo();
8838      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
8839        // Look past the truncate.
8840        UOpNo = User->use_begin().getOperandNo();
8841        User = *User->use_begin();
8842      }
8843
8844      if (User->getOpcode() != ISD::BRCOND &&
8845          User->getOpcode() != ISD::SETCC &&
8846          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
8847        NonFlagUse = true;
8848        break;
8849      }
8850    }
8851
8852    if (!NonFlagUse)
8853      break;
8854  }
8855    // FALL THROUGH
8856  case ISD::SUB:
8857  case ISD::OR:
8858  case ISD::XOR:
8859    // Due to the ISEL shortcoming noted above, be conservative if this op is
8860    // likely to be selected as part of a load-modify-store instruction.
8861    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8862           UE = Op.getNode()->use_end(); UI != UE; ++UI)
8863      if (UI->getOpcode() == ISD::STORE)
8864        goto default_case;
8865
8866    // Otherwise use a regular EFLAGS-setting instruction.
8867    switch (ArithOp.getOpcode()) {
8868    default: llvm_unreachable("unexpected operator!");
8869    case ISD::SUB: Opcode = X86ISD::SUB; break;
8870    case ISD::XOR: Opcode = X86ISD::XOR; break;
8871    case ISD::AND: Opcode = X86ISD::AND; break;
8872    case ISD::OR: {
8873      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
8874        SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
8875        if (EFLAGS.getNode())
8876          return EFLAGS;
8877      }
8878      Opcode = X86ISD::OR;
8879      break;
8880    }
8881    }
8882
8883    NumOperands = 2;
8884    break;
8885  case X86ISD::ADD:
8886  case X86ISD::SUB:
8887  case X86ISD::INC:
8888  case X86ISD::DEC:
8889  case X86ISD::OR:
8890  case X86ISD::XOR:
8891  case X86ISD::AND:
8892    return SDValue(Op.getNode(), 1);
8893  default:
8894  default_case:
8895    break;
8896  }
8897
8898  // If we found that truncation is beneficial, perform the truncation and
8899  // update 'Op'.
8900  if (NeedTruncation) {
8901    EVT VT = Op.getValueType();
8902    SDValue WideVal = Op->getOperand(0);
8903    EVT WideVT = WideVal.getValueType();
8904    unsigned ConvertedOp = 0;
8905    // Use a target machine opcode to prevent further DAGCombine
8906    // optimizations that may separate the arithmetic operations
8907    // from the setcc node.
8908    switch (WideVal.getOpcode()) {
8909      default: break;
8910      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
8911      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
8912      case ISD::AND: ConvertedOp = X86ISD::AND; break;
8913      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
8914      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
8915    }
8916
8917    if (ConvertedOp) {
8918      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8919      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
8920        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
8921        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
8922        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
8923      }
8924    }
8925  }
8926
8927  if (Opcode == 0)
8928    // Emit a CMP with 0, which is the TEST pattern.
8929    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8930                       DAG.getConstant(0, Op.getValueType()));
8931
8932  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
8933  SmallVector<SDValue, 4> Ops;
8934  for (unsigned i = 0; i != NumOperands; ++i)
8935    Ops.push_back(Op.getOperand(i));
8936
8937  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
8938  DAG.ReplaceAllUsesWith(Op, New);
8939  return SDValue(New.getNode(), 1);
8940}
8941
8942/// Emit nodes that will be selected as "cmp Op0,Op1", or something
8943/// equivalent.
8944SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
8945                                   SelectionDAG &DAG) const {
8946  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
8947    if (C->getAPIntValue() == 0)
8948      return EmitTest(Op0, X86CC, DAG);
8949
8950  DebugLoc dl = Op0.getDebugLoc();
8951  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
8952       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
8953    // Use SUB instead of CMP to enable CSE between SUB and CMP.
8954    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
8955    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
8956                              Op0, Op1);
8957    return SDValue(Sub.getNode(), 1);
8958  }
8959  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
8960}
8961
8962/// Convert a comparison if required by the subtarget.
8963SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
8964                                                 SelectionDAG &DAG) const {
8965  // If the subtarget does not support the FUCOMI instruction, floating-point
8966  // comparisons have to be converted.
8967  if (Subtarget->hasCMov() ||
8968      Cmp.getOpcode() != X86ISD::CMP ||
8969      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
8970      !Cmp.getOperand(1).getValueType().isFloatingPoint())
8971    return Cmp;
8972
8973  // The instruction selector will select an FUCOM instruction instead of
8974  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
8975  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
8976  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
8977  DebugLoc dl = Cmp.getDebugLoc();
8978  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
8979  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
8980  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
8981                            DAG.getConstant(8, MVT::i8));
8982  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
8983  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
8984}
8985
8986static bool isAllOnes(SDValue V) {
8987  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
8988  return C && C->isAllOnesValue();
8989}
8990
8991/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
8992/// if it's possible.
8993SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
8994                                     DebugLoc dl, SelectionDAG &DAG) const {
8995  SDValue Op0 = And.getOperand(0);
8996  SDValue Op1 = And.getOperand(1);
8997  if (Op0.getOpcode() == ISD::TRUNCATE)
8998    Op0 = Op0.getOperand(0);
8999  if (Op1.getOpcode() == ISD::TRUNCATE)
9000    Op1 = Op1.getOperand(0);
9001
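      // Recognize ((1 << N) & X), ((X >> N) & 1), and (X & C) where C is a power
      // of two too wide for a TEST immediate; each becomes a single BT of one bit.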
9002  SDValue LHS, RHS;
9003  if (Op1.getOpcode() == ISD::SHL)
9004    std::swap(Op0, Op1);
9005  if (Op0.getOpcode() == ISD::SHL) {
9006    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
9007      if (And00C->getZExtValue() == 1) {
9008        // If we looked past a truncate, check that it's only truncating away
9009        // known zeros.
9010        unsigned BitWidth = Op0.getValueSizeInBits();
9011        unsigned AndBitWidth = And.getValueSizeInBits();
9012        if (BitWidth > AndBitWidth) {
9013          APInt Zeros, Ones;
9014          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
9015          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
9016            return SDValue();
9017        }
9018        LHS = Op1;
9019        RHS = Op0.getOperand(1);
9020      }
9021  } else if (Op1.getOpcode() == ISD::Constant) {
9022    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
9023    uint64_t AndRHSVal = AndRHS->getZExtValue();
9024    SDValue AndLHS = Op0;
9025
9026    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
9027      LHS = AndLHS.getOperand(0);
9028      RHS = AndLHS.getOperand(1);
9029    }
9030
9031    // Use BT if the immediate can't be encoded in a TEST instruction.
9032    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
9033      LHS = AndLHS;
9034      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
9035    }
9036  }
9037
9038  if (LHS.getNode()) {
9039    // If the LHS is of the form (x ^ -1) then replace the LHS with x and flip
9040    // the condition code later.
9041    bool Invert = false;
9042    if (LHS.getOpcode() == ISD::XOR && isAllOnes(LHS.getOperand(1))) {
9043      Invert = true;
9044      LHS = LHS.getOperand(0);
9045    }
9046
9047    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
9048    // instruction.  Since the shift amount is in-range-or-undefined, we know
9049    // that doing a bittest on the i32 value is ok.  We extend to i32 because
9050    // the encoding for the i16 version is larger than the i32 version.
9051    // Also promote i16 to i32 for performance / code size reasons.
9052    if (LHS.getValueType() == MVT::i8 ||
9053        LHS.getValueType() == MVT::i16)
9054      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
9055
9056    // If the operand types disagree, extend the shift amount to match.  Since
9057    // BT ignores high bits (like shifts) we can use anyextend.
9058    if (LHS.getValueType() != RHS.getValueType())
9059      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
9060
9061    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
9062    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
9063    // Flip the condition if the LHS was a 'not' instruction.
9064    if (Invert)
9065      Cond = X86::GetOppositeBranchCondition(Cond);
9066    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9067                       DAG.getConstant(Cond, MVT::i8), BT);
9068  }
9069
9070  return SDValue();
9071}
9072
9073SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9074
9075  if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG);
9076
9077  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
9078  SDValue Op0 = Op.getOperand(0);
9079  SDValue Op1 = Op.getOperand(1);
9080  DebugLoc dl = Op.getDebugLoc();
9081  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
9082
9083  // Optimize to BT if possible.
9084  // Lower (X & (1 << N)) == 0 to BT(X, N).
9085  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
9086  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
9087  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
9088      Op1.getOpcode() == ISD::Constant &&
9089      cast<ConstantSDNode>(Op1)->isNullValue() &&
9090      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9091    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
9092    if (NewSetCC.getNode())
9093      return NewSetCC;
9094  }
9095
9096  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
9097  // these.
9098  if (Op1.getOpcode() == ISD::Constant &&
9099      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
9100       cast<ConstantSDNode>(Op1)->isNullValue()) &&
9101      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9102
9103    // If the input is a setcc, then reuse the input setcc or use a new one with
9104    // the inverted condition.
9105    if (Op0.getOpcode() == X86ISD::SETCC) {
9106      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
9107      bool Invert = (CC == ISD::SETNE) ^
9108        cast<ConstantSDNode>(Op1)->isNullValue();
9109      if (!Invert) return Op0;
9110
9111      CCode = X86::GetOppositeBranchCondition(CCode);
9112      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9113                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
9114    }
9115  }
9116
9117  bool isFP = Op1.getValueType().isFloatingPoint();
9118  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
9119  if (X86CC == X86::COND_INVALID)
9120    return SDValue();
9121
9122  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
9123  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
9124  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9125                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
9126}
9127
9128// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
9129// ones, and then concatenate the result back.
9130static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
9131  EVT VT = Op.getValueType();
9132
9133  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
9134         "Unsupported value type for operation");
9135
9136  unsigned NumElems = VT.getVectorNumElements();
9137  DebugLoc dl = Op.getDebugLoc();
9138  SDValue CC = Op.getOperand(2);
9139
9140  // Extract the LHS vectors
9141  SDValue LHS = Op.getOperand(0);
9142  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
9143  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
9144
9145  // Extract the RHS vectors
9146  SDValue RHS = Op.getOperand(1);
9147  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
9148  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
9149
9150  // Issue the operation on the smaller types and concatenate the result back
9151  MVT EltVT = VT.getVectorElementType().getSimpleVT();
9152  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
9153  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
9154                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
9155                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
9156}
9157
9158SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
9159  SDValue Cond;
9160  SDValue Op0 = Op.getOperand(0);
9161  SDValue Op1 = Op.getOperand(1);
9162  SDValue CC = Op.getOperand(2);
9163  EVT VT = Op.getValueType();
9164  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
9165  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
9166  DebugLoc dl = Op.getDebugLoc();
9167
9168  if (isFP) {
9169#ifndef NDEBUG
9170    EVT EltVT = Op0.getValueType().getVectorElementType();
9171    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
9172#endif
9173
9174    unsigned SSECC;
9175    bool Swap = false;
9176
9177    // SSE Condition code mapping:
9178    //  0 - EQ
9179    //  1 - LT
9180    //  2 - LE
9181    //  3 - UNORD
9182    //  4 - NEQ
9183    //  5 - NLT
9184    //  6 - NLE
9185    //  7 - ORD
9186    switch (SetCCOpcode) {
9187    default: llvm_unreachable("Unexpected SETCC condition");
9188    case ISD::SETOEQ:
9189    case ISD::SETEQ:  SSECC = 0; break;
9190    case ISD::SETOGT:
9191    case ISD::SETGT: Swap = true; // Fallthrough
9192    case ISD::SETLT:
9193    case ISD::SETOLT: SSECC = 1; break;
9194    case ISD::SETOGE:
9195    case ISD::SETGE: Swap = true; // Fallthrough
9196    case ISD::SETLE:
9197    case ISD::SETOLE: SSECC = 2; break;
9198    case ISD::SETUO:  SSECC = 3; break;
9199    case ISD::SETUNE:
9200    case ISD::SETNE:  SSECC = 4; break;
9201    case ISD::SETULE: Swap = true; // Fallthrough
9202    case ISD::SETUGE: SSECC = 5; break;
9203    case ISD::SETULT: Swap = true; // Fallthrough
9204    case ISD::SETUGT: SSECC = 6; break;
9205    case ISD::SETO:   SSECC = 7; break;
9206    case ISD::SETUEQ:
9207    case ISD::SETONE: SSECC = 8; break;
9208    }
9209    if (Swap)
9210      std::swap(Op0, Op1);
9211
9212    // In the two special cases we can't handle, emit two comparisons.
9213    if (SSECC == 8) {
9214      unsigned CC0, CC1;
9215      unsigned CombineOpc;
9216      if (SetCCOpcode == ISD::SETUEQ) {
9217        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
9218      } else {
9219        assert(SetCCOpcode == ISD::SETONE);
9220        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
9221      }
9222
9223      SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
9224                                 DAG.getConstant(CC0, MVT::i8));
9225      SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
9226                                 DAG.getConstant(CC1, MVT::i8));
9227      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
9228    }
9229    // Handle all other FP comparisons here.
9230    return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
9231                       DAG.getConstant(SSECC, MVT::i8));
9232  }
9233
9234  // Break 256-bit integer vector compare into smaller ones.
9235  if (VT.is256BitVector() && !Subtarget->hasInt256())
9236    return Lower256IntVSETCC(Op, DAG);
9237
9238  // We are handling one of the integer comparisons here.  Since SSE only has
9239  // GT and EQ comparisons for integer, swapping operands and multiple
9240  // operations may be required for some comparisons.
9241  unsigned Opc;
9242  bool Swap = false, Invert = false, FlipSigns = false;
9243
9244  switch (SetCCOpcode) {
9245  default: llvm_unreachable("Unexpected SETCC condition");
9246  case ISD::SETNE:  Invert = true;
9247  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
9248  case ISD::SETLT:  Swap = true;
9249  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
9250  case ISD::SETGE:  Swap = true;
9251  case ISD::SETLE:  Opc = X86ISD::PCMPGT; Invert = true; break;
9252  case ISD::SETULT: Swap = true;
9253  case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
9254  case ISD::SETUGE: Swap = true;
9255  case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
9256  }
9257  if (Swap)
9258    std::swap(Op0, Op1);
9259
9260  // Check that the operation in question is available (most are plain SSE2,
9261  // but PCMPGTQ and PCMPEQQ have different requirements).
9262  if (VT == MVT::v2i64) {
9263    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
9264      return SDValue();
9265    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
9266      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
9267      // pcmpeqd + pshufd + pand.
9268      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
9269
9270      // First cast everything to the right type.
9271      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
9272      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
9273
9274      // Do the compare.
9275      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
9276
9277      // Make sure the lower and upper halves are both all-ones.
9278      const int Mask[] = { 1, 0, 3, 2 };
9279      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
9280      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
9281
9282      if (Invert)
9283        Result = DAG.getNOT(dl, Result, MVT::v4i32);
9284
9285      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
9286    }
9287  }
9288
9289  // Since SSE has no unsigned integer comparisons, we need to flip the sign
9290  // bits of the inputs before performing those operations.
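  // (This is the usual trick: with i32 elements, for example,
  //    a <u b   <==>   (a ^ 0x80000000) <s (b ^ 0x80000000),
  // so the unsigned compare becomes a signed PCMPGT on the flipped operands.)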
9291  if (FlipSigns) {
9292    EVT EltVT = VT.getVectorElementType();
9293    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
9294                                      EltVT);
9295    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
9296    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
9297                                    SignBits.size());
9298    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
9299    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
9300  }
9301
9302  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
9303
9304  // If the logical-not of the result is required, perform that now.
9305  if (Invert)
9306    Result = DAG.getNOT(dl, Result, VT);
9307
9308  return Result;
9309}
9310
9311// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
9312static bool isX86LogicalCmp(SDValue Op) {
9313  unsigned Opc = Op.getNode()->getOpcode();
9314  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
9315      Opc == X86ISD::SAHF)
9316    return true;
9317  if (Op.getResNo() == 1 &&
9318      (Opc == X86ISD::ADD ||
9319       Opc == X86ISD::SUB ||
9320       Opc == X86ISD::ADC ||
9321       Opc == X86ISD::SBB ||
9322       Opc == X86ISD::SMUL ||
9323       Opc == X86ISD::UMUL ||
9324       Opc == X86ISD::INC ||
9325       Opc == X86ISD::DEC ||
9326       Opc == X86ISD::OR ||
9327       Opc == X86ISD::XOR ||
9328       Opc == X86ISD::AND))
9329    return true;
9330
9331  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
9332    return true;
9333
9334  return false;
9335}
9336
9337static bool isZero(SDValue V) {
9338  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
9339  return C && C->isNullValue();
9340}
9341
9342static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
9343  if (V.getOpcode() != ISD::TRUNCATE)
9344    return false;
9345
9346  SDValue VOp0 = V.getOperand(0);
9347  unsigned InBits = VOp0.getValueSizeInBits();
9348  unsigned Bits = V.getValueSizeInBits();
9349  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
9350}
9351
9352SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
9353  bool addTest = true;
9354  SDValue Cond  = Op.getOperand(0);
9355  SDValue Op1 = Op.getOperand(1);
9356  SDValue Op2 = Op.getOperand(2);
9357  DebugLoc DL = Op.getDebugLoc();
9358  SDValue CC;
9359
9360  if (Cond.getOpcode() == ISD::SETCC) {
9361    SDValue NewCond = LowerSETCC(Cond, DAG);
9362    if (NewCond.getNode())
9363      Cond = NewCond;
9364  }
9365
9366  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
9367  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
9368  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
9369  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
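  // For example, (select (x == 0), -1, y) can be emitted roughly as
  //   cmp  x, 1       ; CF is set iff x == 0
  //   sbb  res, res   ; res = CF ? -1 : 0
  //   or   res, y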
9370  if (Cond.getOpcode() == X86ISD::SETCC &&
9371      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
9372      isZero(Cond.getOperand(1).getOperand(1))) {
9373    SDValue Cmp = Cond.getOperand(1);
9374
9375    unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
9376
9377    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
9378        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
9379      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
9380
9381      SDValue CmpOp0 = Cmp.getOperand(0);
9382      // Apply further optimizations for special cases
9383      // (select (x != 0), -1, 0) -> neg & sbb
9384      // (select (x == 0), 0, -1) -> neg & sbb
9385      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
9386        if (YC->isNullValue() &&
9387            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
9388          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
9389          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
9390                                    DAG.getConstant(0, CmpOp0.getValueType()),
9391                                    CmpOp0);
9392          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
9393                                    DAG.getConstant(X86::COND_B, MVT::i8),
9394                                    SDValue(Neg.getNode(), 1));
9395          return Res;
9396        }
9397
9398      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
9399                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
9400      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9401
9402      SDValue Res =   // Res = 0 or -1.
9403        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
9404                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
9405
9406      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
9407        Res = DAG.getNOT(DL, Res, Res.getValueType());
9408
9409      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
9410      if (N2C == 0 || !N2C->isNullValue())
9411        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
9412      return Res;
9413    }
9414  }
9415
9416  // Look past (and (setcc_carry (cmp ...)), 1).
9417  if (Cond.getOpcode() == ISD::AND &&
9418      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
9419    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
9420    if (C && C->getAPIntValue() == 1)
9421      Cond = Cond.getOperand(0);
9422  }
9423
9424  // If condition flag is set by a X86ISD::CMP, then use it as the condition
9425  // setting operand in place of the X86ISD::SETCC.
9426  unsigned CondOpcode = Cond.getOpcode();
9427  if (CondOpcode == X86ISD::SETCC ||
9428      CondOpcode == X86ISD::SETCC_CARRY) {
9429    CC = Cond.getOperand(0);
9430
9431    SDValue Cmp = Cond.getOperand(1);
9432    unsigned Opc = Cmp.getOpcode();
9433    EVT VT = Op.getValueType();
9434
9435    bool IllegalFPCMov = false;
9436    if (VT.isFloatingPoint() && !VT.isVector() &&
9437        !isScalarFPTypeInSSEReg(VT))  // FPStack?
9438      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
9439
9440    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
9441        Opc == X86ISD::BT) { // FIXME
9442      Cond = Cmp;
9443      addTest = false;
9444    }
9445  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
9446             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
9447             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
9448              Cond.getOperand(0).getValueType() != MVT::i8)) {
9449    SDValue LHS = Cond.getOperand(0);
9450    SDValue RHS = Cond.getOperand(1);
9451    unsigned X86Opcode;
9452    unsigned X86Cond;
9453    SDVTList VTs;
9454    switch (CondOpcode) {
9455    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
9456    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
9457    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
9458    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
9459    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
9460    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
9461    default: llvm_unreachable("unexpected overflowing operator");
9462    }
9463    if (CondOpcode == ISD::UMULO)
9464      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
9465                          MVT::i32);
9466    else
9467      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
9468
9469    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
9470
9471    if (CondOpcode == ISD::UMULO)
9472      Cond = X86Op.getValue(2);
9473    else
9474      Cond = X86Op.getValue(1);
9475
9476    CC = DAG.getConstant(X86Cond, MVT::i8);
9477    addTest = false;
9478  }
9479
9480  if (addTest) {
9481    // Look past the truncate if the high bits are known zero.
9482    if (isTruncWithZeroHighBitsInput(Cond, DAG))
9483      Cond = Cond.getOperand(0);
9484
9485    // We know the result of AND is compared against zero. Try to match
9486    // it to BT.
9487    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
9488      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
9489      if (NewSetCC.getNode()) {
9490        CC = NewSetCC.getOperand(0);
9491        Cond = NewSetCC.getOperand(1);
9492        addTest = false;
9493      }
9494    }
9495  }
9496
9497  if (addTest) {
9498    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9499    Cond = EmitTest(Cond, X86::COND_NE, DAG);
9500  }
9501
9502  // a <  b ? -1 :  0 -> RES = ~setcc_carry
9503  // a <  b ?  0 : -1 -> RES = setcc_carry
9504  // a >= b ? -1 :  0 -> RES = setcc_carry
9505  // a >= b ?  0 : -1 -> RES = ~setcc_carry
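  // (X86ISD::SETCC_CARRY materializes the carry flag as all-ones or zero,
  // essentially an "sbb reg, reg", so these selects need no CMOV at all.)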
9506  if (Cond.getOpcode() == X86ISD::SUB) {
9507    Cond = ConvertCmpIfNecessary(Cond, DAG);
9508    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
9509
9510    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
9511        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
9512      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
9513                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
9514      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
9515        return DAG.getNOT(DL, Res, Res.getValueType());
9516      return Res;
9517    }
9518  }
9519
9520  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
9521  // widen the cmov and push the truncate through. This avoids introducing a new
9522  // branch during isel and doesn't add any extensions.
9523  if (Op.getValueType() == MVT::i8 &&
9524      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
9525    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
9526    if (T1.getValueType() == T2.getValueType() &&
9527        // Blacklist CopyFromReg to avoid partial register stalls.
9528        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
9529      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
9530      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
9531      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
9532    }
9533  }
9534
9535  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
9536  // the condition is true.
9537  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
9538  SDValue Ops[] = { Op2, Op1, CC, Cond };
9539  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
9540}
9541
9542// isAndOrOfSetCCs - Return true if node is an ISD::AND or
9543// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
9544// from the AND / OR.
9545static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
9546  Opc = Op.getOpcode();
9547  if (Opc != ISD::OR && Opc != ISD::AND)
9548    return false;
9549  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
9550          Op.getOperand(0).hasOneUse() &&
9551          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
9552          Op.getOperand(1).hasOneUse());
9553}
9554
9555// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
9556// 1, and the SETCC node has a single use.
9557static bool isXor1OfSetCC(SDValue Op) {
9558  if (Op.getOpcode() != ISD::XOR)
9559    return false;
9560  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
9561  if (N1C && N1C->getAPIntValue() == 1) {
9562    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
9563      Op.getOperand(0).hasOneUse();
9564  }
9565  return false;
9566}
9567
9568SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
9569  bool addTest = true;
9570  SDValue Chain = Op.getOperand(0);
9571  SDValue Cond  = Op.getOperand(1);
9572  SDValue Dest  = Op.getOperand(2);
9573  DebugLoc dl = Op.getDebugLoc();
9574  SDValue CC;
9575  bool Inverted = false;
9576
9577  if (Cond.getOpcode() == ISD::SETCC) {
9578    // Check for setcc([su]{add,sub,mul}o == 0).
9579    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
9580        isa<ConstantSDNode>(Cond.getOperand(1)) &&
9581        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
9582        Cond.getOperand(0).getResNo() == 1 &&
9583        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
9584         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
9585         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
9586         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
9587         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
9588         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
9589      Inverted = true;
9590      Cond = Cond.getOperand(0);
9591    } else {
9592      SDValue NewCond = LowerSETCC(Cond, DAG);
9593      if (NewCond.getNode())
9594        Cond = NewCond;
9595    }
9596  }
9597#if 0
9598  // FIXME: LowerXALUO doesn't handle these!!
9599  else if (Cond.getOpcode() == X86ISD::ADD  ||
9600           Cond.getOpcode() == X86ISD::SUB  ||
9601           Cond.getOpcode() == X86ISD::SMUL ||
9602           Cond.getOpcode() == X86ISD::UMUL)
9603    Cond = LowerXALUO(Cond, DAG);
9604#endif
9605
9606  // Look past (and (setcc_carry (cmp ...)), 1).
9607  if (Cond.getOpcode() == ISD::AND &&
9608      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
9609    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
9610    if (C && C->getAPIntValue() == 1)
9611      Cond = Cond.getOperand(0);
9612  }
9613
9614  // If condition flag is set by a X86ISD::CMP, then use it as the condition
9615  // setting operand in place of the X86ISD::SETCC.
9616  unsigned CondOpcode = Cond.getOpcode();
9617  if (CondOpcode == X86ISD::SETCC ||
9618      CondOpcode == X86ISD::SETCC_CARRY) {
9619    CC = Cond.getOperand(0);
9620
9621    SDValue Cmp = Cond.getOperand(1);
9622    unsigned Opc = Cmp.getOpcode();
9623    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
9624    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
9625      Cond = Cmp;
9626      addTest = false;
9627    } else {
9628      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
9629      default: break;
9630      case X86::COND_O:
9631      case X86::COND_B:
9632        // These can only come from an arithmetic instruction with overflow,
9633        // e.g. SADDO, UADDO.
9634        Cond = Cond.getNode()->getOperand(1);
9635        addTest = false;
9636        break;
9637      }
9638    }
9639  }
9640  CondOpcode = Cond.getOpcode();
9641  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
9642      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
9643      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
9644       Cond.getOperand(0).getValueType() != MVT::i8)) {
9645    SDValue LHS = Cond.getOperand(0);
9646    SDValue RHS = Cond.getOperand(1);
9647    unsigned X86Opcode;
9648    unsigned X86Cond;
9649    SDVTList VTs;
9650    switch (CondOpcode) {
9651    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
9652    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
9653    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
9654    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
9655    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
9656    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
9657    default: llvm_unreachable("unexpected overflowing operator");
9658    }
9659    if (Inverted)
9660      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
9661    if (CondOpcode == ISD::UMULO)
9662      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
9663                          MVT::i32);
9664    else
9665      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
9666
9667    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
9668
9669    if (CondOpcode == ISD::UMULO)
9670      Cond = X86Op.getValue(2);
9671    else
9672      Cond = X86Op.getValue(1);
9673
9674    CC = DAG.getConstant(X86Cond, MVT::i8);
9675    addTest = false;
9676  } else {
9677    unsigned CondOpc;
9678    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
9679      SDValue Cmp = Cond.getOperand(0).getOperand(1);
9680      if (CondOpc == ISD::OR) {
9681        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
9682        // two branches instead of an explicit OR instruction with a
9683        // separate test.
9684        if (Cmp == Cond.getOperand(1).getOperand(1) &&
9685            isX86LogicalCmp(Cmp)) {
9686          CC = Cond.getOperand(0).getOperand(0);
9687          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9688                              Chain, Dest, CC, Cmp);
9689          CC = Cond.getOperand(1).getOperand(0);
9690          Cond = Cmp;
9691          addTest = false;
9692        }
9693      } else { // ISD::AND
9694        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
9695        // two branches instead of an explicit AND instruction with a
9696        // separate test. However, we only do this if this block doesn't
9697        // have a fall-through edge, because this requires an explicit
9698        // jmp when the condition is false.
9699        if (Cmp == Cond.getOperand(1).getOperand(1) &&
9700            isX86LogicalCmp(Cmp) &&
9701            Op.getNode()->hasOneUse()) {
9702          X86::CondCode CCode =
9703            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
9704          CCode = X86::GetOppositeBranchCondition(CCode);
9705          CC = DAG.getConstant(CCode, MVT::i8);
9706          SDNode *User = *Op.getNode()->use_begin();
9707          // Look for an unconditional branch following this conditional branch.
9708          // We need this because we need to reverse the successors in order
9709          // to implement FCMP_OEQ.
9710          if (User->getOpcode() == ISD::BR) {
9711            SDValue FalseBB = User->getOperand(1);
9712            SDNode *NewBR =
9713              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9714            assert(NewBR == User);
9715            (void)NewBR;
9716            Dest = FalseBB;
9717
9718            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9719                                Chain, Dest, CC, Cmp);
9720            X86::CondCode CCode =
9721              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
9722            CCode = X86::GetOppositeBranchCondition(CCode);
9723            CC = DAG.getConstant(CCode, MVT::i8);
9724            Cond = Cmp;
9725            addTest = false;
9726          }
9727        }
9728      }
9729    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
9730      // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
9731      // It should be transformed by the DAG combiner except when the condition
9732      // is set by an arithmetic-with-overflow node.
9733      X86::CondCode CCode =
9734        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
9735      CCode = X86::GetOppositeBranchCondition(CCode);
9736      CC = DAG.getConstant(CCode, MVT::i8);
9737      Cond = Cond.getOperand(0).getOperand(1);
9738      addTest = false;
9739    } else if (Cond.getOpcode() == ISD::SETCC &&
9740               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
9741      // For FCMP_OEQ, we can emit
9742      // two branches instead of an explicit AND instruction with a
9743      // separate test. However, we only do this if this block doesn't
9744      // have a fall-through edge, because this requires an explicit
9745      // jmp when the condition is false.
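      // Roughly, a == b (FCMP_OEQ) is then emitted as
      //   ucomiss b, a
      //   jne  false_bb       ; ZF == 0
      //   jp   false_bb       ; unordered
      //   jmp  true_bb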
9746      if (Op.getNode()->hasOneUse()) {
9747        SDNode *User = *Op.getNode()->use_begin();
9748        // Look for an unconditional branch following this conditional branch.
9749        // We need this because we need to reverse the successors in order
9750        // to implement FCMP_OEQ.
9751        if (User->getOpcode() == ISD::BR) {
9752          SDValue FalseBB = User->getOperand(1);
9753          SDNode *NewBR =
9754            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9755          assert(NewBR == User);
9756          (void)NewBR;
9757          Dest = FalseBB;
9758
9759          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
9760                                    Cond.getOperand(0), Cond.getOperand(1));
9761          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9762          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9763          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9764                              Chain, Dest, CC, Cmp);
9765          CC = DAG.getConstant(X86::COND_P, MVT::i8);
9766          Cond = Cmp;
9767          addTest = false;
9768        }
9769      }
9770    } else if (Cond.getOpcode() == ISD::SETCC &&
9771               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
9772      // For FCMP_UNE, we can emit
9773      // two branches instead of an explicit OR instruction with a
9774      // separate test. However, we only do this if this block doesn't
9775      // have a fall-through edge, because this requires an explicit
9776      // jmp when the condition is false.
9777      if (Op.getNode()->hasOneUse()) {
9778        SDNode *User = *Op.getNode()->use_begin();
9779        // Look for an unconditional branch following this conditional branch.
9780        // We need this because we need to reverse the successors in order
9781        // to implement FCMP_UNE.
9782        if (User->getOpcode() == ISD::BR) {
9783          SDValue FalseBB = User->getOperand(1);
9784          SDNode *NewBR =
9785            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9786          assert(NewBR == User);
9787          (void)NewBR;
9788
9789          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
9790                                    Cond.getOperand(0), Cond.getOperand(1));
9791          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9792          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9793          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9794                              Chain, Dest, CC, Cmp);
9795          CC = DAG.getConstant(X86::COND_NP, MVT::i8);
9796          Cond = Cmp;
9797          addTest = false;
9798          Dest = FalseBB;
9799        }
9800      }
9801    }
9802  }
9803
9804  if (addTest) {
9805    // Look past the truncate if the high bits are known zero.
9806    if (isTruncWithZeroHighBitsInput(Cond, DAG))
9807      Cond = Cond.getOperand(0);
9808
9809    // We know the result of AND is compared against zero. Try to match
9810    // it to BT.
9811    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
9812      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
9813      if (NewSetCC.getNode()) {
9814        CC = NewSetCC.getOperand(0);
9815        Cond = NewSetCC.getOperand(1);
9816        addTest = false;
9817      }
9818    }
9819  }
9820
9821  if (addTest) {
9822    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9823    Cond = EmitTest(Cond, X86::COND_NE, DAG);
9824  }
9825  Cond = ConvertCmpIfNecessary(Cond, DAG);
9826  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9827                     Chain, Dest, CC, Cond);
9828}
9829
9830// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
9831// A call to _alloca is needed to probe the stack when allocating more than 4k
9832// bytes in one go. Touching the stack at 4K increments is necessary to ensure
9833// that the guard pages used by the OS virtual memory manager are allocated in
9834// the correct sequence.
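// When segmented stacks are not in use, the lowering below passes the
// allocation size in RAX/EAX and emits an X86ISD::WIN_ALLOCA node that is later
// expanded into the actual stack-probe call (a rough summary of the code that
// follows).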
9835SDValue
9836X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
9837                                           SelectionDAG &DAG) const {
9838  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
9839          getTargetMachine().Options.EnableSegmentedStacks) &&
9840         "This should be used only on Windows targets or when segmented stacks "
9841         "are being used");
9842  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
9843  DebugLoc dl = Op.getDebugLoc();
9844
9845  // Get the inputs.
9846  SDValue Chain = Op.getOperand(0);
9847  SDValue Size  = Op.getOperand(1);
9848  // FIXME: Ensure alignment here
9849
9850  bool Is64Bit = Subtarget->is64Bit();
9851  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
9852
9853  if (getTargetMachine().Options.EnableSegmentedStacks) {
9854    MachineFunction &MF = DAG.getMachineFunction();
9855    MachineRegisterInfo &MRI = MF.getRegInfo();
9856
9857    if (Is64Bit) {
9858      // The 64-bit implementation of segmented stacks needs to clobber both r10
9859      // and r11. This makes it impossible to use it along with nested parameters.
9860      const Function *F = MF.getFunction();
9861
9862      for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
9863           I != E; ++I)
9864        if (I->hasNestAttr())
9865          report_fatal_error("Cannot use segmented stacks with functions that "
9866                             "have nested arguments.");
9867    }
9868
9869    const TargetRegisterClass *AddrRegClass =
9870      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
9871    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
9872    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
9873    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
9874                                DAG.getRegister(Vreg, SPTy));
9875    SDValue Ops1[2] = { Value, Chain };
9876    return DAG.getMergeValues(Ops1, 2, dl);
9877  } else {
9878    SDValue Flag;
9879    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
9880
9881    Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
9882    Flag = Chain.getValue(1);
9883    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9884
9885    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
9886    Flag = Chain.getValue(1);
9887
9888    Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
9889                               SPTy).getValue(1);
9890
9891    SDValue Ops1[2] = { Chain.getValue(0), Chain };
9892    return DAG.getMergeValues(Ops1, 2, dl);
9893  }
9894}
9895
9896SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
9897  MachineFunction &MF = DAG.getMachineFunction();
9898  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
9899
9900  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9901  DebugLoc DL = Op.getDebugLoc();
9902
9903  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
9904    // vastart just stores the address of the VarArgsFrameIndex slot into the
9905    // memory location argument.
9906    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
9907                                   getPointerTy());
9908    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9909                        MachinePointerInfo(SV), false, false, 0);
9910  }
9911
9912  // __va_list_tag:
9913  //   gp_offset         (0 - 6 * 8)
9914  //   fp_offset         (48 - 48 + 8 * 16)
9915  //   overflow_arg_area (point to parameters coming in memory).
9916  //   reg_save_area
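  // This corresponds to the AMD64 ABI __va_list_tag layout, roughly:
  //   typedef struct {
  //     unsigned int gp_offset;
  //     unsigned int fp_offset;
  //     void *overflow_arg_area;
  //     void *reg_save_area;
  //   } __va_list_tag;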
9917  SmallVector<SDValue, 8> MemOps;
9918  SDValue FIN = Op.getOperand(1);
9919  // Store gp_offset
9920  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
9921                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
9922                                               MVT::i32),
9923                               FIN, MachinePointerInfo(SV), false, false, 0);
9924  MemOps.push_back(Store);
9925
9926  // Store fp_offset
9927  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9928                    FIN, DAG.getIntPtrConstant(4));
9929  Store = DAG.getStore(Op.getOperand(0), DL,
9930                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
9931                                       MVT::i32),
9932                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
9933  MemOps.push_back(Store);
9934
9935  // Store ptr to overflow_arg_area
9936  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9937                    FIN, DAG.getIntPtrConstant(4));
9938  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
9939                                    getPointerTy());
9940  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
9941                       MachinePointerInfo(SV, 8),
9942                       false, false, 0);
9943  MemOps.push_back(Store);
9944
9945  // Store ptr to reg_save_area.
9946  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9947                    FIN, DAG.getIntPtrConstant(8));
9948  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
9949                                    getPointerTy());
9950  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
9951                       MachinePointerInfo(SV, 16), false, false, 0);
9952  MemOps.push_back(Store);
9953  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
9954                     &MemOps[0], MemOps.size());
9955}
9956
9957SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
9958  assert(Subtarget->is64Bit() &&
9959         "LowerVAARG only handles 64-bit va_arg!");
9960  assert((Subtarget->isTargetLinux() ||
9961          Subtarget->isTargetDarwin()) &&
9962          "Unhandled target in LowerVAARG");
9963  assert(Op.getNode()->getNumOperands() == 4);
9964  SDValue Chain = Op.getOperand(0);
9965  SDValue SrcPtr = Op.getOperand(1);
9966  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9967  unsigned Align = Op.getConstantOperandVal(3);
9968  DebugLoc dl = Op.getDebugLoc();
9969
9970  EVT ArgVT = Op.getNode()->getValueType(0);
9971  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9972  uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
9973  uint8_t ArgMode;
9974
9975  // Decide which area this value should be read from.
9976  // TODO: Implement the AMD64 ABI in its entirety. This simple
9977  // selection mechanism works only for the basic types.
9978  if (ArgVT == MVT::f80) {
9979    llvm_unreachable("va_arg for f80 not yet implemented");
9980  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
9981    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
9982  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
9983    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
9984  } else {
9985    llvm_unreachable("Unhandled argument type in LowerVAARG");
9986  }
9987
9988  if (ArgMode == 2) {
9989    // Sanity Check: Make sure using fp_offset makes sense.
9990    assert(!getTargetMachine().Options.UseSoftFloat &&
9991           !(DAG.getMachineFunction()
9992                .getFunction()->getFnAttributes()
9993                .hasAttribute(Attribute::NoImplicitFloat)) &&
9994           Subtarget->hasSSE1());
9995  }
9996
9997  // Insert a VAARG_64 node into the DAG.
9998  // VAARG_64 returns two values: the variable argument address and the chain.
9999  SmallVector<SDValue, 11> InstOps;
10000  InstOps.push_back(Chain);
10001  InstOps.push_back(SrcPtr);
10002  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
10003  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
10004  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
10005  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
10006  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
10007                                          VTs, &InstOps[0], InstOps.size(),
10008                                          MVT::i64,
10009                                          MachinePointerInfo(SV),
10010                                          /*Align=*/0,
10011                                          /*Volatile=*/false,
10012                                          /*ReadMem=*/true,
10013                                          /*WriteMem=*/true);
10014  Chain = VAARG.getValue(1);
10015
10016  // Load the next argument and return it
10017  return DAG.getLoad(ArgVT, dl,
10018                     Chain,
10019                     VAARG,
10020                     MachinePointerInfo(),
10021                     false, false, false, 0);
10022}
10023
10024static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
10025                           SelectionDAG &DAG) {
10026  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
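  // (24 bytes total: 4 + 4 for the two i32 offsets plus 8 + 8 for the two
  // pointers, which is why the memcpy below copies 24 bytes.)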
10027  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
10028  SDValue Chain = Op.getOperand(0);
10029  SDValue DstPtr = Op.getOperand(1);
10030  SDValue SrcPtr = Op.getOperand(2);
10031  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10032  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10033  DebugLoc DL = Op.getDebugLoc();
10034
10035  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
10036                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
10037                       false,
10038                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
10039}
10040
10041// getTargetVShiftNode - Handle vector element shifts where the shift amount
10042// may or may not be a constant. Takes the immediate version of the shift as input.
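// For a non-constant amount the shift is rebuilt (roughly) as a v4i32 vector
// <amt, 0, undef, undef>, bitcast to a 128-bit vector with the source element
// type, and used with the register form of the shift (e.g. VSRLI -> VSRL).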
10043static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
10044                                   SDValue SrcOp, SDValue ShAmt,
10045                                   SelectionDAG &DAG) {
10046  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
10047
10048  if (isa<ConstantSDNode>(ShAmt)) {
10049    // Constant may be a TargetConstant. Use a regular constant.
10050    uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
10051    switch (Opc) {
10052      default: llvm_unreachable("Unknown target vector shift node");
10053      case X86ISD::VSHLI:
10054      case X86ISD::VSRLI:
10055      case X86ISD::VSRAI:
10056        return DAG.getNode(Opc, dl, VT, SrcOp,
10057                           DAG.getConstant(ShiftAmt, MVT::i32));
10058    }
10059  }
10060
10061  // Change opcode to non-immediate version
10062  switch (Opc) {
10063    default: llvm_unreachable("Unknown target vector shift node");
10064    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
10065    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
10066    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
10067  }
10068
10069  // Need to build a vector containing the shift amount.
10070  // The shift amount is 32 bits, but SSE instructions read 64 bits, so pad with 0.
10071  SDValue ShOps[4];
10072  ShOps[0] = ShAmt;
10073  ShOps[1] = DAG.getConstant(0, MVT::i32);
10074  ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
10075  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
10076
10077  // The return type has to be a 128-bit type with the same element
10078  // type as the input type.
10079  MVT EltVT = VT.getVectorElementType().getSimpleVT();
10080  EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
10081
10082  ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
10083  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
10084}
10085
10086static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
10087  DebugLoc dl = Op.getDebugLoc();
10088  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10089  switch (IntNo) {
10090  default: return SDValue();    // Don't custom lower most intrinsics.
10091  // Comparison intrinsics.
10092  case Intrinsic::x86_sse_comieq_ss:
10093  case Intrinsic::x86_sse_comilt_ss:
10094  case Intrinsic::x86_sse_comile_ss:
10095  case Intrinsic::x86_sse_comigt_ss:
10096  case Intrinsic::x86_sse_comige_ss:
10097  case Intrinsic::x86_sse_comineq_ss:
10098  case Intrinsic::x86_sse_ucomieq_ss:
10099  case Intrinsic::x86_sse_ucomilt_ss:
10100  case Intrinsic::x86_sse_ucomile_ss:
10101  case Intrinsic::x86_sse_ucomigt_ss:
10102  case Intrinsic::x86_sse_ucomige_ss:
10103  case Intrinsic::x86_sse_ucomineq_ss:
10104  case Intrinsic::x86_sse2_comieq_sd:
10105  case Intrinsic::x86_sse2_comilt_sd:
10106  case Intrinsic::x86_sse2_comile_sd:
10107  case Intrinsic::x86_sse2_comigt_sd:
10108  case Intrinsic::x86_sse2_comige_sd:
10109  case Intrinsic::x86_sse2_comineq_sd:
10110  case Intrinsic::x86_sse2_ucomieq_sd:
10111  case Intrinsic::x86_sse2_ucomilt_sd:
10112  case Intrinsic::x86_sse2_ucomile_sd:
10113  case Intrinsic::x86_sse2_ucomigt_sd:
10114  case Intrinsic::x86_sse2_ucomige_sd:
10115  case Intrinsic::x86_sse2_ucomineq_sd: {
10116    unsigned Opc;
10117    ISD::CondCode CC;
10118    switch (IntNo) {
10119    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10120    case Intrinsic::x86_sse_comieq_ss:
10121    case Intrinsic::x86_sse2_comieq_sd:
10122      Opc = X86ISD::COMI;
10123      CC = ISD::SETEQ;
10124      break;
10125    case Intrinsic::x86_sse_comilt_ss:
10126    case Intrinsic::x86_sse2_comilt_sd:
10127      Opc = X86ISD::COMI;
10128      CC = ISD::SETLT;
10129      break;
10130    case Intrinsic::x86_sse_comile_ss:
10131    case Intrinsic::x86_sse2_comile_sd:
10132      Opc = X86ISD::COMI;
10133      CC = ISD::SETLE;
10134      break;
10135    case Intrinsic::x86_sse_comigt_ss:
10136    case Intrinsic::x86_sse2_comigt_sd:
10137      Opc = X86ISD::COMI;
10138      CC = ISD::SETGT;
10139      break;
10140    case Intrinsic::x86_sse_comige_ss:
10141    case Intrinsic::x86_sse2_comige_sd:
10142      Opc = X86ISD::COMI;
10143      CC = ISD::SETGE;
10144      break;
10145    case Intrinsic::x86_sse_comineq_ss:
10146    case Intrinsic::x86_sse2_comineq_sd:
10147      Opc = X86ISD::COMI;
10148      CC = ISD::SETNE;
10149      break;
10150    case Intrinsic::x86_sse_ucomieq_ss:
10151    case Intrinsic::x86_sse2_ucomieq_sd:
10152      Opc = X86ISD::UCOMI;
10153      CC = ISD::SETEQ;
10154      break;
10155    case Intrinsic::x86_sse_ucomilt_ss:
10156    case Intrinsic::x86_sse2_ucomilt_sd:
10157      Opc = X86ISD::UCOMI;
10158      CC = ISD::SETLT;
10159      break;
10160    case Intrinsic::x86_sse_ucomile_ss:
10161    case Intrinsic::x86_sse2_ucomile_sd:
10162      Opc = X86ISD::UCOMI;
10163      CC = ISD::SETLE;
10164      break;
10165    case Intrinsic::x86_sse_ucomigt_ss:
10166    case Intrinsic::x86_sse2_ucomigt_sd:
10167      Opc = X86ISD::UCOMI;
10168      CC = ISD::SETGT;
10169      break;
10170    case Intrinsic::x86_sse_ucomige_ss:
10171    case Intrinsic::x86_sse2_ucomige_sd:
10172      Opc = X86ISD::UCOMI;
10173      CC = ISD::SETGE;
10174      break;
10175    case Intrinsic::x86_sse_ucomineq_ss:
10176    case Intrinsic::x86_sse2_ucomineq_sd:
10177      Opc = X86ISD::UCOMI;
10178      CC = ISD::SETNE;
10179      break;
10180    }
10181
10182    SDValue LHS = Op.getOperand(1);
10183    SDValue RHS = Op.getOperand(2);
10184    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
10185    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
10186    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
10187    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
10188                                DAG.getConstant(X86CC, MVT::i8), Cond);
10189    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
10190  }
10191
10192  // Arithmetic intrinsics.
10193  case Intrinsic::x86_sse2_pmulu_dq:
10194  case Intrinsic::x86_avx2_pmulu_dq:
10195    return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
10196                       Op.getOperand(1), Op.getOperand(2));
10197
10198  // SSE2/AVX2 sub with unsigned saturation intrinsics
10199  case Intrinsic::x86_sse2_psubus_b:
10200  case Intrinsic::x86_sse2_psubus_w:
10201  case Intrinsic::x86_avx2_psubus_b:
10202  case Intrinsic::x86_avx2_psubus_w:
10203    return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
10204                       Op.getOperand(1), Op.getOperand(2));
10205
10206  // SSE3/AVX horizontal add/sub intrinsics
10207  case Intrinsic::x86_sse3_hadd_ps:
10208  case Intrinsic::x86_sse3_hadd_pd:
10209  case Intrinsic::x86_avx_hadd_ps_256:
10210  case Intrinsic::x86_avx_hadd_pd_256:
10211  case Intrinsic::x86_sse3_hsub_ps:
10212  case Intrinsic::x86_sse3_hsub_pd:
10213  case Intrinsic::x86_avx_hsub_ps_256:
10214  case Intrinsic::x86_avx_hsub_pd_256:
10215  case Intrinsic::x86_ssse3_phadd_w_128:
10216  case Intrinsic::x86_ssse3_phadd_d_128:
10217  case Intrinsic::x86_avx2_phadd_w:
10218  case Intrinsic::x86_avx2_phadd_d:
10219  case Intrinsic::x86_ssse3_phsub_w_128:
10220  case Intrinsic::x86_ssse3_phsub_d_128:
10221  case Intrinsic::x86_avx2_phsub_w:
10222  case Intrinsic::x86_avx2_phsub_d: {
10223    unsigned Opcode;
10224    switch (IntNo) {
10225    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10226    case Intrinsic::x86_sse3_hadd_ps:
10227    case Intrinsic::x86_sse3_hadd_pd:
10228    case Intrinsic::x86_avx_hadd_ps_256:
10229    case Intrinsic::x86_avx_hadd_pd_256:
10230      Opcode = X86ISD::FHADD;
10231      break;
10232    case Intrinsic::x86_sse3_hsub_ps:
10233    case Intrinsic::x86_sse3_hsub_pd:
10234    case Intrinsic::x86_avx_hsub_ps_256:
10235    case Intrinsic::x86_avx_hsub_pd_256:
10236      Opcode = X86ISD::FHSUB;
10237      break;
10238    case Intrinsic::x86_ssse3_phadd_w_128:
10239    case Intrinsic::x86_ssse3_phadd_d_128:
10240    case Intrinsic::x86_avx2_phadd_w:
10241    case Intrinsic::x86_avx2_phadd_d:
10242      Opcode = X86ISD::HADD;
10243      break;
10244    case Intrinsic::x86_ssse3_phsub_w_128:
10245    case Intrinsic::x86_ssse3_phsub_d_128:
10246    case Intrinsic::x86_avx2_phsub_w:
10247    case Intrinsic::x86_avx2_phsub_d:
10248      Opcode = X86ISD::HSUB;
10249      break;
10250    }
10251    return DAG.getNode(Opcode, dl, Op.getValueType(),
10252                       Op.getOperand(1), Op.getOperand(2));
10253  }
10254
10255  // SSE2/SSE41/AVX2 integer max/min intrinsics.
10256  case Intrinsic::x86_sse2_pmaxu_b:
10257  case Intrinsic::x86_sse41_pmaxuw:
10258  case Intrinsic::x86_sse41_pmaxud:
10259  case Intrinsic::x86_avx2_pmaxu_b:
10260  case Intrinsic::x86_avx2_pmaxu_w:
10261  case Intrinsic::x86_avx2_pmaxu_d:
10262    return DAG.getNode(X86ISD::UMAX, dl, Op.getValueType(),
10263                       Op.getOperand(1), Op.getOperand(2));
10264  case Intrinsic::x86_sse2_pminu_b:
10265  case Intrinsic::x86_sse41_pminuw:
10266  case Intrinsic::x86_sse41_pminud:
10267  case Intrinsic::x86_avx2_pminu_b:
10268  case Intrinsic::x86_avx2_pminu_w:
10269  case Intrinsic::x86_avx2_pminu_d:
10270    return DAG.getNode(X86ISD::UMIN, dl, Op.getValueType(),
10271                       Op.getOperand(1), Op.getOperand(2));
10272  case Intrinsic::x86_sse41_pmaxsb:
10273  case Intrinsic::x86_sse2_pmaxs_w:
10274  case Intrinsic::x86_sse41_pmaxsd:
10275  case Intrinsic::x86_avx2_pmaxs_b:
10276  case Intrinsic::x86_avx2_pmaxs_w:
10277  case Intrinsic::x86_avx2_pmaxs_d:
10278    return DAG.getNode(X86ISD::SMAX, dl, Op.getValueType(),
10279                       Op.getOperand(1), Op.getOperand(2));
10280  case Intrinsic::x86_sse41_pminsb:
10281  case Intrinsic::x86_sse2_pmins_w:
10282  case Intrinsic::x86_sse41_pminsd:
10283  case Intrinsic::x86_avx2_pmins_b:
10284  case Intrinsic::x86_avx2_pmins_w:
10285  case Intrinsic::x86_avx2_pmins_d:
10286    return DAG.getNode(X86ISD::SMIN, dl, Op.getValueType(),
10287                       Op.getOperand(1), Op.getOperand(2));
10288
10289  // AVX2 variable shift intrinsics
10290  case Intrinsic::x86_avx2_psllv_d:
10291  case Intrinsic::x86_avx2_psllv_q:
10292  case Intrinsic::x86_avx2_psllv_d_256:
10293  case Intrinsic::x86_avx2_psllv_q_256:
10294  case Intrinsic::x86_avx2_psrlv_d:
10295  case Intrinsic::x86_avx2_psrlv_q:
10296  case Intrinsic::x86_avx2_psrlv_d_256:
10297  case Intrinsic::x86_avx2_psrlv_q_256:
10298  case Intrinsic::x86_avx2_psrav_d:
10299  case Intrinsic::x86_avx2_psrav_d_256: {
10300    unsigned Opcode;
10301    switch (IntNo) {
10302    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10303    case Intrinsic::x86_avx2_psllv_d:
10304    case Intrinsic::x86_avx2_psllv_q:
10305    case Intrinsic::x86_avx2_psllv_d_256:
10306    case Intrinsic::x86_avx2_psllv_q_256:
10307      Opcode = ISD::SHL;
10308      break;
10309    case Intrinsic::x86_avx2_psrlv_d:
10310    case Intrinsic::x86_avx2_psrlv_q:
10311    case Intrinsic::x86_avx2_psrlv_d_256:
10312    case Intrinsic::x86_avx2_psrlv_q_256:
10313      Opcode = ISD::SRL;
10314      break;
10315    case Intrinsic::x86_avx2_psrav_d:
10316    case Intrinsic::x86_avx2_psrav_d_256:
10317      Opcode = ISD::SRA;
10318      break;
10319    }
10320    return DAG.getNode(Opcode, dl, Op.getValueType(),
10321                       Op.getOperand(1), Op.getOperand(2));
10322  }
10323
10324  case Intrinsic::x86_ssse3_pshuf_b_128:
10325  case Intrinsic::x86_avx2_pshuf_b:
10326    return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
10327                       Op.getOperand(1), Op.getOperand(2));
10328
10329  case Intrinsic::x86_ssse3_psign_b_128:
10330  case Intrinsic::x86_ssse3_psign_w_128:
10331  case Intrinsic::x86_ssse3_psign_d_128:
10332  case Intrinsic::x86_avx2_psign_b:
10333  case Intrinsic::x86_avx2_psign_w:
10334  case Intrinsic::x86_avx2_psign_d:
10335    return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
10336                       Op.getOperand(1), Op.getOperand(2));
10337
10338  case Intrinsic::x86_sse41_insertps:
10339    return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
10340                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
10341
10342  case Intrinsic::x86_avx_vperm2f128_ps_256:
10343  case Intrinsic::x86_avx_vperm2f128_pd_256:
10344  case Intrinsic::x86_avx_vperm2f128_si_256:
10345  case Intrinsic::x86_avx2_vperm2i128:
10346    return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
10347                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
10348
10349  case Intrinsic::x86_avx2_permd:
10350  case Intrinsic::x86_avx2_permps:
10351    // Operands intentionally swapped. Mask is last operand to intrinsic,
10352    // but second operand for node/instruction.
10353    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
10354                       Op.getOperand(2), Op.getOperand(1));
10355
10356  // ptest and testp intrinsics. The intrinsics these come from are designed to
10357  // return an integer value, not just set flags, so lower them to the ptest
10358  // or testp pattern plus a setcc on the result.
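  // For example, ptestz(a, b) becomes roughly PTEST a, b followed by a
  // SETCC(COND_E) that is zero-extended to i32.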
10359  case Intrinsic::x86_sse41_ptestz:
10360  case Intrinsic::x86_sse41_ptestc:
10361  case Intrinsic::x86_sse41_ptestnzc:
10362  case Intrinsic::x86_avx_ptestz_256:
10363  case Intrinsic::x86_avx_ptestc_256:
10364  case Intrinsic::x86_avx_ptestnzc_256:
10365  case Intrinsic::x86_avx_vtestz_ps:
10366  case Intrinsic::x86_avx_vtestc_ps:
10367  case Intrinsic::x86_avx_vtestnzc_ps:
10368  case Intrinsic::x86_avx_vtestz_pd:
10369  case Intrinsic::x86_avx_vtestc_pd:
10370  case Intrinsic::x86_avx_vtestnzc_pd:
10371  case Intrinsic::x86_avx_vtestz_ps_256:
10372  case Intrinsic::x86_avx_vtestc_ps_256:
10373  case Intrinsic::x86_avx_vtestnzc_ps_256:
10374  case Intrinsic::x86_avx_vtestz_pd_256:
10375  case Intrinsic::x86_avx_vtestc_pd_256:
10376  case Intrinsic::x86_avx_vtestnzc_pd_256: {
10377    bool IsTestPacked = false;
10378    unsigned X86CC;
10379    switch (IntNo) {
10380    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
10381    case Intrinsic::x86_avx_vtestz_ps:
10382    case Intrinsic::x86_avx_vtestz_pd:
10383    case Intrinsic::x86_avx_vtestz_ps_256:
10384    case Intrinsic::x86_avx_vtestz_pd_256:
10385      IsTestPacked = true; // Fallthrough
10386    case Intrinsic::x86_sse41_ptestz:
10387    case Intrinsic::x86_avx_ptestz_256:
10388      // ZF = 1
10389      X86CC = X86::COND_E;
10390      break;
10391    case Intrinsic::x86_avx_vtestc_ps:
10392    case Intrinsic::x86_avx_vtestc_pd:
10393    case Intrinsic::x86_avx_vtestc_ps_256:
10394    case Intrinsic::x86_avx_vtestc_pd_256:
10395      IsTestPacked = true; // Fallthrough
10396    case Intrinsic::x86_sse41_ptestc:
10397    case Intrinsic::x86_avx_ptestc_256:
10398      // CF = 1
10399      X86CC = X86::COND_B;
10400      break;
10401    case Intrinsic::x86_avx_vtestnzc_ps:
10402    case Intrinsic::x86_avx_vtestnzc_pd:
10403    case Intrinsic::x86_avx_vtestnzc_ps_256:
10404    case Intrinsic::x86_avx_vtestnzc_pd_256:
10405      IsTestPacked = true; // Fallthrough
10406    case Intrinsic::x86_sse41_ptestnzc:
10407    case Intrinsic::x86_avx_ptestnzc_256:
10408      // ZF and CF = 0
10409      X86CC = X86::COND_A;
10410      break;
10411    }
10412
10413    SDValue LHS = Op.getOperand(1);
10414    SDValue RHS = Op.getOperand(2);
10415    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
10416    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
10417    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
10418    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
10419    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
10420  }
10421
10422  // SSE/AVX shift intrinsics
10423  case Intrinsic::x86_sse2_psll_w:
10424  case Intrinsic::x86_sse2_psll_d:
10425  case Intrinsic::x86_sse2_psll_q:
10426  case Intrinsic::x86_avx2_psll_w:
10427  case Intrinsic::x86_avx2_psll_d:
10428  case Intrinsic::x86_avx2_psll_q:
10429  case Intrinsic::x86_sse2_psrl_w:
10430  case Intrinsic::x86_sse2_psrl_d:
10431  case Intrinsic::x86_sse2_psrl_q:
10432  case Intrinsic::x86_avx2_psrl_w:
10433  case Intrinsic::x86_avx2_psrl_d:
10434  case Intrinsic::x86_avx2_psrl_q:
10435  case Intrinsic::x86_sse2_psra_w:
10436  case Intrinsic::x86_sse2_psra_d:
10437  case Intrinsic::x86_avx2_psra_w:
10438  case Intrinsic::x86_avx2_psra_d: {
10439    unsigned Opcode;
10440    switch (IntNo) {
10441    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10442    case Intrinsic::x86_sse2_psll_w:
10443    case Intrinsic::x86_sse2_psll_d:
10444    case Intrinsic::x86_sse2_psll_q:
10445    case Intrinsic::x86_avx2_psll_w:
10446    case Intrinsic::x86_avx2_psll_d:
10447    case Intrinsic::x86_avx2_psll_q:
10448      Opcode = X86ISD::VSHL;
10449      break;
10450    case Intrinsic::x86_sse2_psrl_w:
10451    case Intrinsic::x86_sse2_psrl_d:
10452    case Intrinsic::x86_sse2_psrl_q:
10453    case Intrinsic::x86_avx2_psrl_w:
10454    case Intrinsic::x86_avx2_psrl_d:
10455    case Intrinsic::x86_avx2_psrl_q:
10456      Opcode = X86ISD::VSRL;
10457      break;
10458    case Intrinsic::x86_sse2_psra_w:
10459    case Intrinsic::x86_sse2_psra_d:
10460    case Intrinsic::x86_avx2_psra_w:
10461    case Intrinsic::x86_avx2_psra_d:
10462      Opcode = X86ISD::VSRA;
10463      break;
10464    }
10465    return DAG.getNode(Opcode, dl, Op.getValueType(),
10466                       Op.getOperand(1), Op.getOperand(2));
10467  }
10468
10469  // SSE/AVX immediate shift intrinsics
10470  case Intrinsic::x86_sse2_pslli_w:
10471  case Intrinsic::x86_sse2_pslli_d:
10472  case Intrinsic::x86_sse2_pslli_q:
10473  case Intrinsic::x86_avx2_pslli_w:
10474  case Intrinsic::x86_avx2_pslli_d:
10475  case Intrinsic::x86_avx2_pslli_q:
10476  case Intrinsic::x86_sse2_psrli_w:
10477  case Intrinsic::x86_sse2_psrli_d:
10478  case Intrinsic::x86_sse2_psrli_q:
10479  case Intrinsic::x86_avx2_psrli_w:
10480  case Intrinsic::x86_avx2_psrli_d:
10481  case Intrinsic::x86_avx2_psrli_q:
10482  case Intrinsic::x86_sse2_psrai_w:
10483  case Intrinsic::x86_sse2_psrai_d:
10484  case Intrinsic::x86_avx2_psrai_w:
10485  case Intrinsic::x86_avx2_psrai_d: {
10486    unsigned Opcode;
10487    switch (IntNo) {
10488    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10489    case Intrinsic::x86_sse2_pslli_w:
10490    case Intrinsic::x86_sse2_pslli_d:
10491    case Intrinsic::x86_sse2_pslli_q:
10492    case Intrinsic::x86_avx2_pslli_w:
10493    case Intrinsic::x86_avx2_pslli_d:
10494    case Intrinsic::x86_avx2_pslli_q:
10495      Opcode = X86ISD::VSHLI;
10496      break;
10497    case Intrinsic::x86_sse2_psrli_w:
10498    case Intrinsic::x86_sse2_psrli_d:
10499    case Intrinsic::x86_sse2_psrli_q:
10500    case Intrinsic::x86_avx2_psrli_w:
10501    case Intrinsic::x86_avx2_psrli_d:
10502    case Intrinsic::x86_avx2_psrli_q:
10503      Opcode = X86ISD::VSRLI;
10504      break;
10505    case Intrinsic::x86_sse2_psrai_w:
10506    case Intrinsic::x86_sse2_psrai_d:
10507    case Intrinsic::x86_avx2_psrai_w:
10508    case Intrinsic::x86_avx2_psrai_d:
10509      Opcode = X86ISD::VSRAI;
10510      break;
10511    }
10512    return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
10513                               Op.getOperand(1), Op.getOperand(2), DAG);
10514  }
10515
10516  case Intrinsic::x86_sse42_pcmpistria128:
10517  case Intrinsic::x86_sse42_pcmpestria128:
10518  case Intrinsic::x86_sse42_pcmpistric128:
10519  case Intrinsic::x86_sse42_pcmpestric128:
10520  case Intrinsic::x86_sse42_pcmpistrio128:
10521  case Intrinsic::x86_sse42_pcmpestrio128:
10522  case Intrinsic::x86_sse42_pcmpistris128:
10523  case Intrinsic::x86_sse42_pcmpestris128:
10524  case Intrinsic::x86_sse42_pcmpistriz128:
10525  case Intrinsic::x86_sse42_pcmpestriz128: {
10526    unsigned Opcode;
10527    unsigned X86CC;
10528    switch (IntNo) {
10529    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10530    case Intrinsic::x86_sse42_pcmpistria128:
10531      Opcode = X86ISD::PCMPISTRI;
10532      X86CC = X86::COND_A;
10533      break;
10534    case Intrinsic::x86_sse42_pcmpestria128:
10535      Opcode = X86ISD::PCMPESTRI;
10536      X86CC = X86::COND_A;
10537      break;
10538    case Intrinsic::x86_sse42_pcmpistric128:
10539      Opcode = X86ISD::PCMPISTRI;
10540      X86CC = X86::COND_B;
10541      break;
10542    case Intrinsic::x86_sse42_pcmpestric128:
10543      Opcode = X86ISD::PCMPESTRI;
10544      X86CC = X86::COND_B;
10545      break;
10546    case Intrinsic::x86_sse42_pcmpistrio128:
10547      Opcode = X86ISD::PCMPISTRI;
10548      X86CC = X86::COND_O;
10549      break;
10550    case Intrinsic::x86_sse42_pcmpestrio128:
10551      Opcode = X86ISD::PCMPESTRI;
10552      X86CC = X86::COND_O;
10553      break;
10554    case Intrinsic::x86_sse42_pcmpistris128:
10555      Opcode = X86ISD::PCMPISTRI;
10556      X86CC = X86::COND_S;
10557      break;
10558    case Intrinsic::x86_sse42_pcmpestris128:
10559      Opcode = X86ISD::PCMPESTRI;
10560      X86CC = X86::COND_S;
10561      break;
10562    case Intrinsic::x86_sse42_pcmpistriz128:
10563      Opcode = X86ISD::PCMPISTRI;
10564      X86CC = X86::COND_E;
10565      break;
10566    case Intrinsic::x86_sse42_pcmpestriz128:
10567      Opcode = X86ISD::PCMPESTRI;
10568      X86CC = X86::COND_E;
10569      break;
10570    }
10571    SmallVector<SDValue, 5> NewOps;
10572    NewOps.append(Op->op_begin()+1, Op->op_end());
10573    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
10574    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
10575    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
10576                                DAG.getConstant(X86CC, MVT::i8),
10577                                SDValue(PCMP.getNode(), 1));
10578    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
10579  }
10580
10581  case Intrinsic::x86_sse42_pcmpistri128:
10582  case Intrinsic::x86_sse42_pcmpestri128: {
10583    unsigned Opcode;
10584    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
10585      Opcode = X86ISD::PCMPISTRI;
10586    else
10587      Opcode = X86ISD::PCMPESTRI;
10588
10589    SmallVector<SDValue, 5> NewOps;
10590    NewOps.append(Op->op_begin()+1, Op->op_end());
10591    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
10592    return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
10593  }
10594  case Intrinsic::x86_fma_vfmadd_ps:
10595  case Intrinsic::x86_fma_vfmadd_pd:
10596  case Intrinsic::x86_fma_vfmsub_ps:
10597  case Intrinsic::x86_fma_vfmsub_pd:
10598  case Intrinsic::x86_fma_vfnmadd_ps:
10599  case Intrinsic::x86_fma_vfnmadd_pd:
10600  case Intrinsic::x86_fma_vfnmsub_ps:
10601  case Intrinsic::x86_fma_vfnmsub_pd:
10602  case Intrinsic::x86_fma_vfmaddsub_ps:
10603  case Intrinsic::x86_fma_vfmaddsub_pd:
10604  case Intrinsic::x86_fma_vfmsubadd_ps:
10605  case Intrinsic::x86_fma_vfmsubadd_pd:
10606  case Intrinsic::x86_fma_vfmadd_ps_256:
10607  case Intrinsic::x86_fma_vfmadd_pd_256:
10608  case Intrinsic::x86_fma_vfmsub_ps_256:
10609  case Intrinsic::x86_fma_vfmsub_pd_256:
10610  case Intrinsic::x86_fma_vfnmadd_ps_256:
10611  case Intrinsic::x86_fma_vfnmadd_pd_256:
10612  case Intrinsic::x86_fma_vfnmsub_ps_256:
10613  case Intrinsic::x86_fma_vfnmsub_pd_256:
10614  case Intrinsic::x86_fma_vfmaddsub_ps_256:
10615  case Intrinsic::x86_fma_vfmaddsub_pd_256:
10616  case Intrinsic::x86_fma_vfmsubadd_ps_256:
10617  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
10618    unsigned Opc;
10619    switch (IntNo) {
10620    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10621    case Intrinsic::x86_fma_vfmadd_ps:
10622    case Intrinsic::x86_fma_vfmadd_pd:
10623    case Intrinsic::x86_fma_vfmadd_ps_256:
10624    case Intrinsic::x86_fma_vfmadd_pd_256:
10625      Opc = X86ISD::FMADD;
10626      break;
10627    case Intrinsic::x86_fma_vfmsub_ps:
10628    case Intrinsic::x86_fma_vfmsub_pd:
10629    case Intrinsic::x86_fma_vfmsub_ps_256:
10630    case Intrinsic::x86_fma_vfmsub_pd_256:
10631      Opc = X86ISD::FMSUB;
10632      break;
10633    case Intrinsic::x86_fma_vfnmadd_ps:
10634    case Intrinsic::x86_fma_vfnmadd_pd:
10635    case Intrinsic::x86_fma_vfnmadd_ps_256:
10636    case Intrinsic::x86_fma_vfnmadd_pd_256:
10637      Opc = X86ISD::FNMADD;
10638      break;
10639    case Intrinsic::x86_fma_vfnmsub_ps:
10640    case Intrinsic::x86_fma_vfnmsub_pd:
10641    case Intrinsic::x86_fma_vfnmsub_ps_256:
10642    case Intrinsic::x86_fma_vfnmsub_pd_256:
10643      Opc = X86ISD::FNMSUB;
10644      break;
10645    case Intrinsic::x86_fma_vfmaddsub_ps:
10646    case Intrinsic::x86_fma_vfmaddsub_pd:
10647    case Intrinsic::x86_fma_vfmaddsub_ps_256:
10648    case Intrinsic::x86_fma_vfmaddsub_pd_256:
10649      Opc = X86ISD::FMADDSUB;
10650      break;
10651    case Intrinsic::x86_fma_vfmsubadd_ps:
10652    case Intrinsic::x86_fma_vfmsubadd_pd:
10653    case Intrinsic::x86_fma_vfmsubadd_ps_256:
10654    case Intrinsic::x86_fma_vfmsubadd_pd_256:
10655      Opc = X86ISD::FMSUBADD;
10656      break;
10657    }
10658
10659    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
10660                       Op.getOperand(2), Op.getOperand(3));
10661  }
10662  }
10663}
10664
10665static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
10666  DebugLoc dl = Op.getDebugLoc();
10667  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
10668  switch (IntNo) {
10669  default: return SDValue();    // Don't custom lower most intrinsics.
10670
10671  // RDRAND intrinsics.
10672  case Intrinsic::x86_rdrand_16:
10673  case Intrinsic::x86_rdrand_32:
10674  case Intrinsic::x86_rdrand_64: {
10675    // Emit the node with the right value type.
10676    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
10677    SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0));
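    // X86ISD::RDRAND produces {random value, glue, chain}; the glue result
    // carries EFLAGS, where the hardware sets CF=1 on success and clears the
    // destination register (and CF) when no random value was available.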
10678
10679    // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise
10680    // return the value from RDRAND, which is always 0, cast to i32.
10681    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
10682                      DAG.getConstant(1, Op->getValueType(1)),
10683                      DAG.getConstant(X86::COND_B, MVT::i32),
10684                      SDValue(Result.getNode(), 1) };
10685    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
10686                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
10687                                  Ops, 4);
10688
10689    // Return { result, isValid, chain }.
10690    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
10691                       SDValue(Result.getNode(), 2));
10692  }
10693  }
10694}
10695
10696SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
10697                                           SelectionDAG &DAG) const {
10698  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
10699  MFI->setReturnAddressIsTaken(true);
10700
10701  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10702  DebugLoc dl = Op.getDebugLoc();
10703  EVT PtrVT = getPointerTy();
10704
10705  if (Depth > 0) {
10706    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10707    SDValue Offset =
10708      DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
10709    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
10710                       DAG.getNode(ISD::ADD, dl, PtrVT,
10711                                   FrameAddr, Offset),
10712                       MachinePointerInfo(), false, false, false, 0);
10713  }
10714
10715  // Just load the return address.
10716  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
10717  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
10718                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
10719}
10720
10721SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
10722  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
10723  MFI->setFrameAddressIsTaken(true);
10724
10725  EVT VT = Op.getValueType();
10726  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
10727  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10728  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
10729  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
10730  while (Depth--)
10731    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
10732                            MachinePointerInfo(),
10733                            false, false, false, 0);
10734  return FrameAddr;
10735}
10736
10737SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
10738                                                     SelectionDAG &DAG) const {
10739  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
10740}
10741
10742SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
10743  SDValue Chain     = Op.getOperand(0);
10744  SDValue Offset    = Op.getOperand(1);
10745  SDValue Handler   = Op.getOperand(2);
10746  DebugLoc dl       = Op.getDebugLoc();
10747
10748  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
10749                                     Subtarget->is64Bit() ? X86::RBP : X86::EBP,
10750                                     getPointerTy());
10751  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
10752
10753  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
10754                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
10755  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
10756  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
10757                       false, false, 0);
10758  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
10759
10760  return DAG.getNode(X86ISD::EH_RETURN, dl,
10761                     MVT::Other,
10762                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
10763}
10764
10765SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
10766                                               SelectionDAG &DAG) const {
10767  DebugLoc DL = Op.getDebugLoc();
10768  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
10769                     DAG.getVTList(MVT::i32, MVT::Other),
10770                     Op.getOperand(0), Op.getOperand(1));
10771}
10772
10773SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
10774                                                SelectionDAG &DAG) const {
10775  DebugLoc DL = Op.getDebugLoc();
10776  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
10777                     Op.getOperand(0), Op.getOperand(1));
10778}
10779
10780static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
10781  return Op.getOperand(0);
10782}
10783
10784SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
10785                                                SelectionDAG &DAG) const {
10786  SDValue Root = Op.getOperand(0);
10787  SDValue Trmp = Op.getOperand(1); // trampoline
10788  SDValue FPtr = Op.getOperand(2); // nested function
10789  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
10790  DebugLoc dl  = Op.getDebugLoc();
10791
10792  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10793  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
10794
10795  if (Subtarget->is64Bit()) {
10796    SDValue OutChains[6];
10797
10798    // Large code-model.
10799    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
10800    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
10801
10802    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
10803    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
10804
10805    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
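    // 0x40 is the REX base, 0x08 is REX.W (64-bit operand size) and 0x01 is
    // REX.B, which extends the register field so that R10/R11 can be encoded.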
10806
10807    // Load the pointer to the nested function into R11.
10808    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
10809    SDValue Addr = Trmp;
10810    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
10811                                Addr, MachinePointerInfo(TrmpAddr),
10812                                false, false, 0);
10813
10814    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10815                       DAG.getConstant(2, MVT::i64));
10816    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
10817                                MachinePointerInfo(TrmpAddr, 2),
10818                                false, false, 2);
10819
10820    // Load the 'nest' parameter value into R10.
10821    // R10 is specified in X86CallingConv.td
10822    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
10823    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10824                       DAG.getConstant(10, MVT::i64));
10825    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
10826                                Addr, MachinePointerInfo(TrmpAddr, 10),
10827                                false, false, 0);
10828
10829    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10830                       DAG.getConstant(12, MVT::i64));
10831    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
10832                                MachinePointerInfo(TrmpAddr, 12),
10833                                false, false, 2);
10834
10835    // Jump to the nested function.
10836    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
10837    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10838                       DAG.getConstant(20, MVT::i64));
10839    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
10840                                Addr, MachinePointerInfo(TrmpAddr, 20),
10841                                false, false, 0);
10842
10843    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
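    // ModRM layout: mod=11b (register direct) in bits 7:6, reg=/4 (the JMP
    // r/m64 opcode extension for 0xFF) in bits 5:3, and R11's low three
    // encoding bits in bits 2:0.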
10844    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10845                       DAG.getConstant(22, MVT::i64));
10846    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
10847                                MachinePointerInfo(TrmpAddr, 22),
10848                                false, false, 0);
10849
10850    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
10851  } else {
10852    const Function *Func =
10853      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
10854    CallingConv::ID CC = Func->getCallingConv();
10855    unsigned NestReg;
10856
10857    switch (CC) {
10858    default:
10859      llvm_unreachable("Unsupported calling convention");
10860    case CallingConv::C:
10861    case CallingConv::X86_StdCall: {
10862      // Pass 'nest' parameter in ECX.
10863      // Must be kept in sync with X86CallingConv.td
10864      NestReg = X86::ECX;
10865
10866      // Check that ECX wasn't needed by an 'inreg' parameter.
10867      FunctionType *FTy = Func->getFunctionType();
10868      const AttributeSet &Attrs = Func->getAttributes();
10869
10870      if (!Attrs.isEmpty() && !Func->isVarArg()) {
10871        unsigned InRegCount = 0;
10872        unsigned Idx = 1;
10873
10874        for (FunctionType::param_iterator I = FTy->param_begin(),
10875             E = FTy->param_end(); I != E; ++I, ++Idx)
10876          if (Attrs.getParamAttributes(Idx).hasAttribute(Attribute::InReg))
10877            // FIXME: should only count parameters that are lowered to integers.
10878            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
10879
10880        if (InRegCount > 2) {
10881          report_fatal_error("Nest register in use - reduce number of inreg"
10882                             " parameters!");
10883        }
10884      }
10885      break;
10886    }
10887    case CallingConv::X86_FastCall:
10888    case CallingConv::X86_ThisCall:
10889    case CallingConv::Fast:
10890      // Pass 'nest' parameter in EAX.
10891      // Must be kept in sync with X86CallingConv.td
10892      NestReg = X86::EAX;
10893      break;
10894    }
10895
10896    SDValue OutChains[4];
10897    SDValue Addr, Disp;
10898
10899    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10900                       DAG.getConstant(10, MVT::i32));
10901    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
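    // The E9 jmp takes a rel32 displacement measured from the end of the
    // instruction, i.e. from Trmp+10 (1 opcode byte + 4 immediate bytes for
    // the mov, then 1 opcode byte + 4 displacement bytes for the jmp), so the
    // value stored below is FPtr - (Trmp + 10).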
10902
10903    // This is storing the opcode for MOV32ri.
10904    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
10905    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
10906    OutChains[0] = DAG.getStore(Root, dl,
10907                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
10908                                Trmp, MachinePointerInfo(TrmpAddr),
10909                                false, false, 0);
10910
10911    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10912                       DAG.getConstant(1, MVT::i32));
10913    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
10914                                MachinePointerInfo(TrmpAddr, 1),
10915                                false, false, 1);
10916
10917    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
10918    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10919                       DAG.getConstant(5, MVT::i32));
10920    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
10921                                MachinePointerInfo(TrmpAddr, 5),
10922                                false, false, 1);
10923
10924    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10925                       DAG.getConstant(6, MVT::i32));
10926    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
10927                                MachinePointerInfo(TrmpAddr, 6),
10928                                false, false, 1);
10929
10930    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
10931  }
10932}
10933
10934SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
10935                                            SelectionDAG &DAG) const {
10936  /*
10937   The rounding mode is in bits 11:10 of the FP control word (FPCW, read
10938   below with FNSTCW), and has the following settings:
10939     00 Round to nearest
10940     01 Round to -inf
10941     10 Round to +inf
10942     11 Round to 0
10943
10944  FLT_ROUNDS, on the other hand, expects the following:
10945    -1 Undefined
10946     0 Round to 0
10947     1 Round to nearest
10948     2 Round to +inf
10949     3 Round to -inf
10950
10951  To perform the conversion, we do:
10952    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
10953  */
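  // Worked example: with RC = 01 (round toward -inf), bit 11 is clear and
  // bit 10 is set, so the expression is ((0 | 2) + 1) & 3 = 3, FLT_ROUNDS'
  // encoding of round to -inf. Likewise RC = 00 gives 1 (nearest), RC = 10
  // gives 2 (+inf), and RC = 11 gives 0 (round to 0).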
10954
10955  MachineFunction &MF = DAG.getMachineFunction();
10956  const TargetMachine &TM = MF.getTarget();
10957  const TargetFrameLowering &TFI = *TM.getFrameLowering();
10958  unsigned StackAlignment = TFI.getStackAlignment();
10959  EVT VT = Op.getValueType();
10960  DebugLoc DL = Op.getDebugLoc();
10961
10962  // Save FP Control Word to stack slot
10963  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
10964  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
10965
10966  MachineMemOperand *MMO =
10967   MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
10968                           MachineMemOperand::MOStore, 2, 2);
10969
10970  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
10971  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
10972                                          DAG.getVTList(MVT::Other),
10973                                          Ops, 2, MVT::i16, MMO);
10974
10975  // Load FP Control Word from stack slot
10976  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
10977                            MachinePointerInfo(), false, false, false, 0);
10978
10979  // Transform as necessary
10980  SDValue CWD1 =
10981    DAG.getNode(ISD::SRL, DL, MVT::i16,
10982                DAG.getNode(ISD::AND, DL, MVT::i16,
10983                            CWD, DAG.getConstant(0x800, MVT::i16)),
10984                DAG.getConstant(11, MVT::i8));
10985  SDValue CWD2 =
10986    DAG.getNode(ISD::SRL, DL, MVT::i16,
10987                DAG.getNode(ISD::AND, DL, MVT::i16,
10988                            CWD, DAG.getConstant(0x400, MVT::i16)),
10989                DAG.getConstant(9, MVT::i8));
10990
10991  SDValue RetVal =
10992    DAG.getNode(ISD::AND, DL, MVT::i16,
10993                DAG.getNode(ISD::ADD, DL, MVT::i16,
10994                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
10995                            DAG.getConstant(1, MVT::i16)),
10996                DAG.getConstant(3, MVT::i16));
10997
10998  return DAG.getNode((VT.getSizeInBits() < 16 ?
10999                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
11000}
11001
11002static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
11003  EVT VT = Op.getValueType();
11004  EVT OpVT = VT;
11005  unsigned NumBits = VT.getSizeInBits();
11006  DebugLoc dl = Op.getDebugLoc();
11007
11008  Op = Op.getOperand(0);
11009  if (VT == MVT::i8) {
11010    // Zero-extend to i32 since there is no i8 bsr.
11011    OpVT = MVT::i32;
11012    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
11013  }
11014
11015  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
11016  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
11017  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
11018
11019  // If src is zero (i.e. bsr sets ZF), the result is NumBits.
11020  SDValue Ops[] = {
11021    Op,
11022    DAG.getConstant(NumBits+NumBits-1, OpVT),
11023    DAG.getConstant(X86::COND_E, MVT::i8),
11024    Op.getValue(1)
11025  };
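  // When the source is zero, BSR leaves its destination undefined and sets
  // ZF, so the CMOV substitutes 2*NumBits-1; the XOR with NumBits-1 below
  // then yields NumBits, the value CTLZ defines for a zero input. For a
  // nonzero source the XOR turns the bit index i into NumBits-1-i, the
  // leading zero count.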
11026  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
11027
11028  // Finally xor with NumBits-1.
11029  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
11030
11031  if (VT == MVT::i8)
11032    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
11033  return Op;
11034}
11035
11036static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
11037  EVT VT = Op.getValueType();
11038  EVT OpVT = VT;
11039  unsigned NumBits = VT.getSizeInBits();
11040  DebugLoc dl = Op.getDebugLoc();
11041
11042  Op = Op.getOperand(0);
11043  if (VT == MVT::i8) {
11044    // Zero-extend to i32 since there is no i8 bsr.
11045    OpVT = MVT::i32;
11046    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
11047  }
11048
11049  // Issue a bsr (scan bits in reverse).
11050  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
11051  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
11052
11053  // And xor with NumBits-1.
11054  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
11055
11056  if (VT == MVT::i8)
11057    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
11058  return Op;
11059}
11060
11061static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
11062  EVT VT = Op.getValueType();
11063  unsigned NumBits = VT.getSizeInBits();
11064  DebugLoc dl = Op.getDebugLoc();
11065  Op = Op.getOperand(0);
11066
11067  // Issue a bsf (scan bits forward) which also sets EFLAGS.
11068  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
11069  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
11070
11071  // If src is zero (i.e. bsf sets ZF), the result is NumBits.
11072  SDValue Ops[] = {
11073    Op,
11074    DAG.getConstant(NumBits, VT),
11075    DAG.getConstant(X86::COND_E, MVT::i8),
11076    Op.getValue(1)
11077  };
11078  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
11079}
11080
11081// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
11082// ones, and then concatenate the result back.
11083static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
11084  EVT VT = Op.getValueType();
11085
11086  assert(VT.is256BitVector() && VT.isInteger() &&
11087         "Unsupported value type for operation");
11088
11089  unsigned NumElems = VT.getVectorNumElements();
11090  DebugLoc dl = Op.getDebugLoc();
11091
11092  // Extract the LHS vectors
11093  SDValue LHS = Op.getOperand(0);
11094  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
11095  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
11096
11097  // Extract the RHS vectors
11098  SDValue RHS = Op.getOperand(1);
11099  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
11100  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
11101
11102  MVT EltVT = VT.getVectorElementType().getSimpleVT();
11103  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
11104
11105  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
11106                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
11107                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
11108}
11109
11110static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
11111  assert(Op.getValueType().is256BitVector() &&
11112         Op.getValueType().isInteger() &&
11113         "Only handle AVX 256-bit vector integer operation");
11114  return Lower256IntArith(Op, DAG);
11115}
11116
11117static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
11118  assert(Op.getValueType().is256BitVector() &&
11119         Op.getValueType().isInteger() &&
11120         "Only handle AVX 256-bit vector integer operation");
11121  return Lower256IntArith(Op, DAG);
11122}
11123
11124static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
11125                        SelectionDAG &DAG) {
11126  DebugLoc dl = Op.getDebugLoc();
11127  EVT VT = Op.getValueType();
11128
11129  // Decompose 256-bit ops into smaller 128-bit ops.
11130  if (VT.is256BitVector() && !Subtarget->hasInt256())
11131    return Lower256IntArith(Op, DAG);
11132
11133  SDValue A = Op.getOperand(0);
11134  SDValue B = Op.getOperand(1);
11135
11136  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
11137  if (VT == MVT::v4i32) {
11138    assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
11139           "Should not custom lower when pmuldq is available!");
11140
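    // PMULUDQ multiplies the two even (0 and 2) 32-bit lanes of its operands
    // into 64-bit products whose low halves are the lane results we want, so
    // handle the odd lanes by shuffling them into even positions, multiplying
    // separately, and interleaving the two partial results at the end.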
11141    // Extract the odd parts.
11142    const int UnpackMask[] = { 1, -1, 3, -1 };
11143    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
11144    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
11145
11146    // Multiply the even parts.
11147    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
11148    // Now multiply odd parts.
11149    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
11150
11151    Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
11152    Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
11153
11154    // Merge the two vectors back together with a shuffle. This expands into 2
11155    // shuffles.
11156    const int ShufMask[] = { 0, 4, 2, 6 };
11157    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
11158  }
11159
11160  assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
11161         "Only know how to lower V2I64/V4I64 multiply");
11162
11163  //  Ahi = psrlqi(a, 32);
11164  //  Bhi = psrlqi(b, 32);
11165  //
11166  //  AloBlo = pmuludq(a, b);
11167  //  AloBhi = pmuludq(a, Bhi);
11168  //  AhiBlo = pmuludq(Ahi, b);
11169
11170  //  AloBhi = psllqi(AloBhi, 32);
11171  //  AhiBlo = psllqi(AhiBlo, 32);
11172  //  return AloBlo + AloBhi + AhiBlo;
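  //  (The Ahi*Bhi term is shifted left by 64 and therefore contributes
  //   nothing to the low 64 bits of the product, so it is omitted.)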
11173
11174  SDValue ShAmt = DAG.getConstant(32, MVT::i32);
11175
11176  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
11177  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
11178
11179  // Bit cast to 32-bit vectors for MULUDQ
11180  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
11181  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
11182  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
11183  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
11184  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
11185
11186  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
11187  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
11188  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
11189
11190  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
11191  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
11192
11193  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
11194  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
11195}
11196
11197SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
11198
11199  EVT VT = Op.getValueType();
11200  DebugLoc dl = Op.getDebugLoc();
11201  SDValue R = Op.getOperand(0);
11202  SDValue Amt = Op.getOperand(1);
11203  LLVMContext *Context = DAG.getContext();
11204
11205  if (!Subtarget->hasSSE2())
11206    return SDValue();
11207
11208  // Optimize shl/srl/sra with constant shift amount.
11209  if (isSplatVector(Amt.getNode())) {
11210    SDValue SclrAmt = Amt->getOperand(0);
11211    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
11212      uint64_t ShiftAmt = C->getZExtValue();
11213
11214      if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
11215          (Subtarget->hasInt256() &&
11216           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
11217        if (Op.getOpcode() == ISD::SHL)
11218          return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
11219                             DAG.getConstant(ShiftAmt, MVT::i32));
11220        if (Op.getOpcode() == ISD::SRL)
11221          return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
11222                             DAG.getConstant(ShiftAmt, MVT::i32));
11223        if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
11224          return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
11225                             DAG.getConstant(ShiftAmt, MVT::i32));
11226      }
11227
11228      if (VT == MVT::v16i8) {
11229        if (Op.getOpcode() == ISD::SHL) {
11230          // Make a large shift.
11231          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
11232                                    DAG.getConstant(ShiftAmt, MVT::i32));
11233          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
11234          // Zero out the low ShiftAmt bits of each byte; the 16-bit shift
11235          // lets bits cross into the neighboring byte.
11235          SmallVector<SDValue, 16> V(16,
11236                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
11237                                                     MVT::i8));
11238          return DAG.getNode(ISD::AND, dl, VT, SHL,
11239                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
11240        }
11241        if (Op.getOpcode() == ISD::SRL) {
11242          // Make a large shift.
11243          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
11244                                    DAG.getConstant(ShiftAmt, MVT::i32));
11245          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
11246          // Zero out the high ShiftAmt bits of each byte; the 16-bit shift
11247          // lets bits cross into the neighboring byte.
11247          SmallVector<SDValue, 16> V(16,
11248                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
11249                                                     MVT::i8));
11250          return DAG.getNode(ISD::AND, dl, VT, SRL,
11251                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
11252        }
11253        if (Op.getOpcode() == ISD::SRA) {
11254          if (ShiftAmt == 7) {
11255            // R s>> 7  ===  R s< 0
11256            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
11257            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
11258          }
11259
11260          // R s>> a === ((R u>> a) ^ m) - m
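          // where m = 128 >> a marks the bit the sign lands on after the
          // logical shift; (x ^ m) - m leaves x unchanged when that bit is
          // clear and subtracts 2*m when it is set, filling the high bits
          // with copies of the sign.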
11261          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
11262          SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
11263                                                         MVT::i8));
11264          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
11265          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
11266          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
11267          return Res;
11268        }
11269        llvm_unreachable("Unknown shift opcode.");
11270      }
11271
11272      if (Subtarget->hasInt256() && VT == MVT::v32i8) {
11273        if (Op.getOpcode() == ISD::SHL) {
11274          // Make a large shift.
11275          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
11276                                    DAG.getConstant(ShiftAmt, MVT::i32));
11277          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
11278          // Zero out the low ShiftAmt bits of each byte; the 16-bit shift
11279          // lets bits cross into the neighboring byte.
11279          SmallVector<SDValue, 32> V(32,
11280                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
11281                                                     MVT::i8));
11282          return DAG.getNode(ISD::AND, dl, VT, SHL,
11283                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
11284        }
11285        if (Op.getOpcode() == ISD::SRL) {
11286          // Make a large shift.
11287          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
11288                                    DAG.getConstant(ShiftAmt, MVT::i32));
11289          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
11290          // Zero out the high ShiftAmt bits of each byte; the 16-bit shift
11291          // lets bits cross into the neighboring byte.
11291          SmallVector<SDValue, 32> V(32,
11292                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
11293                                                     MVT::i8));
11294          return DAG.getNode(ISD::AND, dl, VT, SRL,
11295                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
11296        }
11297        if (Op.getOpcode() == ISD::SRA) {
11298          if (ShiftAmt == 7) {
11299            // R s>> 7  ===  R s< 0
11300            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
11301            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
11302          }
11303
11304          // R s>> a === ((R u>> a) ^ m) - m
11305          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
11306          SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
11307                                                         MVT::i8));
11308          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
11309          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
11310          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
11311          return Res;
11312        }
11313        llvm_unreachable("Unknown shift opcode.");
11314      }
11315    }
11316  }
11317
11318  // Lower SHL with variable shift amount.
11319  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
11320    Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1),
11321                     DAG.getConstant(23, MVT::i32));
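    // Shifting each 32-bit amount left by 23 places it in the exponent field
    // of an IEEE single; adding 0x3f800000 (1.0f) biases that exponent so the
    // lane, viewed as a float, equals 2^amt. Converting back to integer then
    // gives per-lane powers of two, and R << amt becomes R * 2^amt.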
11322
11323    const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U};
11324    Constant *C = ConstantDataVector::get(*Context, CV);
11325    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
11326    SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
11327                                 MachinePointerInfo::getConstantPool(),
11328                                 false, false, false, 16);
11329
11330    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
11331    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
11332    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
11333    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
11334  }
11335  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
11336    assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
11337
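    // Only the low three bits of each byte's shift amount matter. Move bit 2
    // into each byte's sign bit (a << 5), then run three rounds that test the
    // sign bit with AND/PCMPEQ and conditionally shift R left by 4, 2 and 1,
    // doubling 'a' between rounds to expose the next amount bit.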
11338    // a = a << 5;
11339    Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1),
11340                     DAG.getConstant(5, MVT::i32));
11341    Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
11342
11343    // Turn 'a' into a mask suitable for VSELECT
11344    SDValue VSelM = DAG.getConstant(0x80, VT);
11345    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
11346    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
11347
11348    SDValue CM1 = DAG.getConstant(0x0f, VT);
11349    SDValue CM2 = DAG.getConstant(0x3f, VT);
11350
11351    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
11352    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
11353    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
11354                            DAG.getConstant(4, MVT::i32), DAG);
11355    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
11356    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
11357
11358    // a += a
11359    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
11360    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
11361    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
11362
11363    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
11364    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
11365    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
11366                            DAG.getConstant(2, MVT::i32), DAG);
11367    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
11368    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
11369
11370    // a += a
11371    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
11372    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
11373    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
11374
11375    // return VSELECT(r, r+r, a);
11376    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
11377                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
11378    return R;
11379  }
11380
11381  // Decompose 256-bit shifts into smaller 128-bit shifts.
11382  if (VT.is256BitVector()) {
11383    unsigned NumElems = VT.getVectorNumElements();
11384    MVT EltVT = VT.getVectorElementType().getSimpleVT();
11385    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
11386
11387    // Extract the two vectors
11388    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
11389    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
11390
11391    // Recreate the shift amount vectors
11392    SDValue Amt1, Amt2;
11393    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
11394      // Constant shift amount
11395      SmallVector<SDValue, 4> Amt1Csts;
11396      SmallVector<SDValue, 4> Amt2Csts;
11397      for (unsigned i = 0; i != NumElems/2; ++i)
11398        Amt1Csts.push_back(Amt->getOperand(i));
11399      for (unsigned i = NumElems/2; i != NumElems; ++i)
11400        Amt2Csts.push_back(Amt->getOperand(i));
11401
11402      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
11403                                 &Amt1Csts[0], NumElems/2);
11404      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
11405                                 &Amt2Csts[0], NumElems/2);
11406    } else {
11407      // Variable shift amount
11408      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
11409      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
11410    }
11411
11412    // Issue new vector shifts for the smaller types
11413    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
11414    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
11415
11416    // Concatenate the result back
11417    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
11418  }
11419
11420  return SDValue();
11421}
11422
11423static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
11424  // Lower the "add/sub/mul with overflow" instruction into a regular instruction
11425  // plus a "setcc" instruction that checks the overflow flag. The "brcond" lowering
11426  // looks for this combo and may remove the "setcc" instruction if the "setcc"
11427  // has only one use.
11428  SDNode *N = Op.getNode();
11429  SDValue LHS = N->getOperand(0);
11430  SDValue RHS = N->getOperand(1);
11431  unsigned BaseOp = 0;
11432  unsigned Cond = 0;
11433  DebugLoc DL = Op.getDebugLoc();
11434  switch (Op.getOpcode()) {
11435  default: llvm_unreachable("Unknown ovf instruction!");
11436  case ISD::SADDO:
11437    // An add of one will be selected as an INC. Note that INC doesn't
11438    // set CF, so we can't do this for UADDO.
11439    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
11440      if (C->isOne()) {
11441        BaseOp = X86ISD::INC;
11442        Cond = X86::COND_O;
11443        break;
11444      }
11445    BaseOp = X86ISD::ADD;
11446    Cond = X86::COND_O;
11447    break;
11448  case ISD::UADDO:
11449    BaseOp = X86ISD::ADD;
11450    Cond = X86::COND_B;
11451    break;
11452  case ISD::SSUBO:
11453    // A subtract of one will be selected as a DEC. Note that DEC doesn't
11454    // set CF, so we can't do this for USUBO.
11455    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
11456      if (C->isOne()) {
11457        BaseOp = X86ISD::DEC;
11458        Cond = X86::COND_O;
11459        break;
11460      }
11461    BaseOp = X86ISD::SUB;
11462    Cond = X86::COND_O;
11463    break;
11464  case ISD::USUBO:
11465    BaseOp = X86ISD::SUB;
11466    Cond = X86::COND_B;
11467    break;
11468  case ISD::SMULO:
11469    BaseOp = X86ISD::SMUL;
11470    Cond = X86::COND_O;
11471    break;
11472  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
11473    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
11474                                 MVT::i32);
11475    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
11476
11477    SDValue SetCC =
11478      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
11479                  DAG.getConstant(X86::COND_O, MVT::i32),
11480                  SDValue(Sum.getNode(), 2));
11481
11482    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
11483  }
11484  }
11485
11486  // Also sets EFLAGS.
11487  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
11488  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
11489
11490  SDValue SetCC =
11491    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
11492                DAG.getConstant(Cond, MVT::i32),
11493                SDValue(Sum.getNode(), 1));
11494
11495  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
11496}
11497
11498SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
11499                                                  SelectionDAG &DAG) const {
11500  DebugLoc dl = Op.getDebugLoc();
11501  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
11502  EVT VT = Op.getValueType();
11503
11504  if (!Subtarget->hasSSE2() || !VT.isVector())
11505    return SDValue();
11506
11507  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
11508                      ExtraVT.getScalarType().getSizeInBits();
11509  SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
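  // Lower this as a pair of shifts: a left shift by BitsDiff moves the
  // ExtraVT sign bit into each element's MSB, and an arithmetic right shift
  // by the same amount replicates it across the high bits.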
11510
11511  switch (VT.getSimpleVT().SimpleTy) {
11512    default: return SDValue();
11513    case MVT::v8i32:
11514    case MVT::v16i16:
11515      if (!Subtarget->hasFp256())
11516        return SDValue();
11517      if (!Subtarget->hasInt256()) {
11518        // needs to be split
11519        unsigned NumElems = VT.getVectorNumElements();
11520
11521        // Extract the LHS vectors
11522        SDValue LHS = Op.getOperand(0);
11523        SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
11524        SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
11525
11526        MVT EltVT = VT.getVectorElementType().getSimpleVT();
11527        EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
11528
11529        EVT ExtraEltVT = ExtraVT.getVectorElementType();
11530        unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
11531        ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
11532                                   ExtraNumElems/2);
11533        SDValue Extra = DAG.getValueType(ExtraVT);
11534
11535        LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
11536        LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
11537
11538        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
11539      }
11540      // fall through
11541    case MVT::v4i32:
11542    case MVT::v8i16: {
11543      SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT,
11544                                         Op.getOperand(0), ShAmt, DAG);
11545      return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
11546    }
11547  }
11548}
11549
11550static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget,
11551                              SelectionDAG &DAG) {
11552  DebugLoc dl = Op.getDebugLoc();
11553
11554  // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
11555  // There isn't any reason to disable it if the target processor supports it.
11556  if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
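    // Without SSE2 on a 32-bit target there is no mfence, so emit a locked OR
    // of zero into the top of the stack instead: the atomic read-modify-write
    // leaves memory unchanged but still orders all prior loads and stores.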
11557    SDValue Chain = Op.getOperand(0);
11558    SDValue Zero = DAG.getConstant(0, MVT::i32);
11559    SDValue Ops[] = {
11560      DAG.getRegister(X86::ESP, MVT::i32), // Base
11561      DAG.getTargetConstant(1, MVT::i8),   // Scale
11562      DAG.getRegister(0, MVT::i32),        // Index
11563      DAG.getTargetConstant(0, MVT::i32),  // Disp
11564      DAG.getRegister(0, MVT::i32),        // Segment.
11565      Zero,
11566      Chain
11567    };
11568    SDNode *Res =
11569      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
11570                          array_lengthof(Ops));
11571    return SDValue(Res, 0);
11572  }
11573
11574  unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
11575  if (!isDev)
11576    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
11577
11578  unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
11579  unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
11580  unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
11581  unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
11582
11583  // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
11584  if (!Op1 && !Op2 && !Op3 && Op4)
11585    return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));
11586
11587  // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
11588  if (Op1 && !Op2 && !Op3 && !Op4)
11589    return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));
11590
11591  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
11592  //           (MFENCE)>;
11593  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
11594}
11595
11596static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
11597                                 SelectionDAG &DAG) {
11598  DebugLoc dl = Op.getDebugLoc();
11599  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
11600    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
11601  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
11602    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
11603
11604  // The only fence that needs an instruction is a sequentially-consistent
11605  // cross-thread fence.
11606  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
11607    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
11608    // no-sse2). There isn't any reason to disable it if the target processor
11609    // supports it.
11610    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
11611      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
11612
11613    SDValue Chain = Op.getOperand(0);
11614    SDValue Zero = DAG.getConstant(0, MVT::i32);
11615    SDValue Ops[] = {
11616      DAG.getRegister(X86::ESP, MVT::i32), // Base
11617      DAG.getTargetConstant(1, MVT::i8),   // Scale
11618      DAG.getRegister(0, MVT::i32),        // Index
11619      DAG.getTargetConstant(0, MVT::i32),  // Disp
11620      DAG.getRegister(0, MVT::i32),        // Segment.
11621      Zero,
11622      Chain
11623    };
11624    SDNode *Res =
11625      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
11626                         array_lengthof(Ops));
11627    return SDValue(Res, 0);
11628  }
11629
11630  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
11631  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
11632}
11633
11634static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
11635                             SelectionDAG &DAG) {
11636  EVT T = Op.getValueType();
11637  DebugLoc DL = Op.getDebugLoc();
11638  unsigned Reg = 0;
11639  unsigned size = 0;
11640  switch(T.getSimpleVT().SimpleTy) {
11641  default: llvm_unreachable("Invalid value type!");
11642  case MVT::i8:  Reg = X86::AL;  size = 1; break;
11643  case MVT::i16: Reg = X86::AX;  size = 2; break;
11644  case MVT::i32: Reg = X86::EAX; size = 4; break;
11645  case MVT::i64:
11646    assert(Subtarget->is64Bit() && "Node not type legal!");
11647    Reg = X86::RAX; size = 8;
11648    break;
11649  }
11650  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
11651                                    Op.getOperand(2), SDValue());
11652  SDValue Ops[] = { cpIn.getValue(0),
11653                    Op.getOperand(1),
11654                    Op.getOperand(3),
11655                    DAG.getTargetConstant(size, MVT::i8),
11656                    cpIn.getValue(1) };
11657  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11658  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
11659  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
11660                                           Ops, 5, T, MMO);
11661  SDValue cpOut =
11662    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
11663  return cpOut;
11664}
11665
11666static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
11667                                     SelectionDAG &DAG) {
11668  assert(Subtarget->is64Bit() && "Result not type legalized?");
11669  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11670  SDValue TheChain = Op.getOperand(0);
11671  DebugLoc dl = Op.getDebugLoc();
11672  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
11673  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
11674  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
11675                                   rax.getValue(2));
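  // RDTSC returns the time-stamp counter split across EDX:EAX (the upper
  // halves of RDX/RAX are zeroed in 64-bit mode), so reassemble the 64-bit
  // value as (RDX << 32) | RAX.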
11676  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
11677                            DAG.getConstant(32, MVT::i8));
11678  SDValue Ops[] = {
11679    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
11680    rdx.getValue(1)
11681  };
11682  return DAG.getMergeValues(Ops, 2, dl);
11683}
11684
11685SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
11686  EVT SrcVT = Op.getOperand(0).getValueType();
11687  EVT DstVT = Op.getValueType();
11688  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
11689         Subtarget->hasMMX() && "Unexpected custom BITCAST");
11690  assert((DstVT == MVT::i64 ||
11691          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
11692         "Unexpected custom BITCAST");
11693  // i64 <=> MMX conversions are Legal.
11694  if (SrcVT==MVT::i64 && DstVT.isVector())
11695    return Op;
11696  if (DstVT==MVT::i64 && SrcVT.isVector())
11697    return Op;
11698  // MMX <=> MMX conversions are Legal.
11699  if (SrcVT.isVector() && DstVT.isVector())
11700    return Op;
11701  // All other conversions need to be expanded.
11702  return SDValue();
11703}
11704
11705static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
11706  SDNode *Node = Op.getNode();
11707  DebugLoc dl = Node->getDebugLoc();
11708  EVT T = Node->getValueType(0);
11709  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
11710                              DAG.getConstant(0, T), Node->getOperand(2));
11711  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
11712                       cast<AtomicSDNode>(Node)->getMemoryVT(),
11713                       Node->getOperand(0),
11714                       Node->getOperand(1), negOp,
11715                       cast<AtomicSDNode>(Node)->getSrcValue(),
11716                       cast<AtomicSDNode>(Node)->getAlignment(),
11717                       cast<AtomicSDNode>(Node)->getOrdering(),
11718                       cast<AtomicSDNode>(Node)->getSynchScope());
11719}
11720
11721static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
11722  SDNode *Node = Op.getNode();
11723  DebugLoc dl = Node->getDebugLoc();
11724  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
11725
11726  // Convert seq_cst store -> xchg
11727  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
11728  // FIXME: On 32-bit, store -> fist or movq would be more efficient
11729  //        (The only way to get a 16-byte store is cmpxchg16b)
11730  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
11731  if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
11732      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
11733    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
11734                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
11735                                 Node->getOperand(0),
11736                                 Node->getOperand(1), Node->getOperand(2),
11737                                 cast<AtomicSDNode>(Node)->getMemOperand(),
11738                                 cast<AtomicSDNode>(Node)->getOrdering(),
11739                                 cast<AtomicSDNode>(Node)->getSynchScope());
11740    return Swap.getValue(1);
11741  }
11742  // Other atomic stores have a simple pattern.
11743  return Op;
11744}
11745
11746static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
11747  EVT VT = Op.getNode()->getValueType(0);
11748
11749  // Let legalize expand this if it isn't a legal type yet.
11750  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
11751    return SDValue();
11752
11753  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
11754
11755  unsigned Opc;
11756  bool ExtraOp = false;
11757  switch (Op.getOpcode()) {
11758  default: llvm_unreachable("Invalid code");
11759  case ISD::ADDC: Opc = X86ISD::ADD; break;
11760  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
11761  case ISD::SUBC: Opc = X86ISD::SUB; break;
11762  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
11763  }
11764
11765  if (!ExtraOp)
11766    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
11767                       Op.getOperand(1));
11768  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
11769                     Op.getOperand(1), Op.getOperand(2));
11770}
11771
11772/// LowerOperation - Provide custom lowering hooks for some operations.
11773///
11774SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
11775  switch (Op.getOpcode()) {
11776  default: llvm_unreachable("Should not custom lower this!");
11777  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
11778  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op, Subtarget, DAG);
11779  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
11780  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op, Subtarget, DAG);
11781  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
11782  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
11783  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
11784  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
11785  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
11786  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
11787  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
11788  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
11789  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
11790  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
11791  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
11792  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
11793  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
11794  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
11795  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
11796  case ISD::SHL_PARTS:
11797  case ISD::SRA_PARTS:
11798  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
11799  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
11800  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
11801  case ISD::TRUNCATE:           return lowerTRUNCATE(Op, DAG);
11802  case ISD::ZERO_EXTEND:        return lowerZERO_EXTEND(Op, DAG);
11803  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
11804  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
11805  case ISD::FP_EXTEND:          return lowerFP_EXTEND(Op, DAG);
11806  case ISD::FABS:               return LowerFABS(Op, DAG);
11807  case ISD::FNEG:               return LowerFNEG(Op, DAG);
11808  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
11809  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
11810  case ISD::SETCC:              return LowerSETCC(Op, DAG);
11811  case ISD::SELECT:             return LowerSELECT(Op, DAG);
11812  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
11813  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
11814  case ISD::VASTART:            return LowerVASTART(Op, DAG);
11815  case ISD::VAARG:              return LowerVAARG(Op, DAG);
11816  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
11817  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
11818  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
11819  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
11820  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
11821  case ISD::FRAME_TO_ARGS_OFFSET:
11822                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
11823  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
11824  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
11825  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
11826  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
11827  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
11828  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
11829  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
11830  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
11831  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
11832  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
11833  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
11834  case ISD::SRA:
11835  case ISD::SRL:
11836  case ISD::SHL:                return LowerShift(Op, DAG);
11837  case ISD::SADDO:
11838  case ISD::UADDO:
11839  case ISD::SSUBO:
11840  case ISD::USUBO:
11841  case ISD::SMULO:
11842  case ISD::UMULO:              return LowerXALUO(Op, DAG);
11843  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
11844  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
11845  case ISD::ADDC:
11846  case ISD::ADDE:
11847  case ISD::SUBC:
11848  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
11849  case ISD::ADD:                return LowerADD(Op, DAG);
11850  case ISD::SUB:                return LowerSUB(Op, DAG);
11851  }
11852}
11853
11854static void ReplaceATOMIC_LOAD(SDNode *Node,
11855                                  SmallVectorImpl<SDValue> &Results,
11856                                  SelectionDAG &DAG) {
11857  DebugLoc dl = Node->getDebugLoc();
11858  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
11859
11860  // Convert wide load -> cmpxchg8b/cmpxchg16b
11861  // FIXME: On 32-bit, load -> fild or movq would be more efficient
11862  //        (The only way to get a 16-byte load is cmpxchg16b)
11863  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
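  // A wide atomic load is emulated with CMPXCHG comparing against 0 and
  // swapping in 0: memory is only rewritten when it already holds 0, and in
  // that case the value written equals the value read, so the returned
  // "old value" is exactly the loaded value.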
11864  SDValue Zero = DAG.getConstant(0, VT);
11865  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
11866                               Node->getOperand(0),
11867                               Node->getOperand(1), Zero, Zero,
11868                               cast<AtomicSDNode>(Node)->getMemOperand(),
11869                               cast<AtomicSDNode>(Node)->getOrdering(),
11870                               cast<AtomicSDNode>(Node)->getSynchScope());
11871  Results.push_back(Swap.getValue(0));
11872  Results.push_back(Swap.getValue(1));
11873}
11874
11875static void
11876ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
11877                        SelectionDAG &DAG, unsigned NewOp) {
11878  DebugLoc dl = Node->getDebugLoc();
11879  assert (Node->getValueType(0) == MVT::i64 &&
11880          "Only know how to expand i64 atomics");
11881
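  // Split the 64-bit RHS into two 32-bit halves and emit the corresponding
  // X86ISD::ATOM*64_DAG node; it is later expanded into a CMPXCHG8B loop
  // (see EmitAtomicLoadArith6432).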
11882  SDValue Chain = Node->getOperand(0);
11883  SDValue In1 = Node->getOperand(1);
11884  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
11885                             Node->getOperand(2), DAG.getIntPtrConstant(0));
11886  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
11887                             Node->getOperand(2), DAG.getIntPtrConstant(1));
11888  SDValue Ops[] = { Chain, In1, In2L, In2H };
11889  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
11890  SDValue Result =
11891    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
11892                            cast<MemSDNode>(Node)->getMemOperand());
11893  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
11894  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
11895  Results.push_back(Result.getValue(2));
11896}
11897
11898/// ReplaceNodeResults - Replace a node with an illegal result type
11899/// with a new node built out of custom code.
11900void X86TargetLowering::ReplaceNodeResults(SDNode *N,
11901                                           SmallVectorImpl<SDValue>&Results,
11902                                           SelectionDAG &DAG) const {
11903  DebugLoc dl = N->getDebugLoc();
11904  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11905  switch (N->getOpcode()) {
11906  default:
11907    llvm_unreachable("Do not know how to custom type legalize this operation!");
11908  case ISD::SIGN_EXTEND_INREG:
11909  case ISD::ADDC:
11910  case ISD::ADDE:
11911  case ISD::SUBC:
11912  case ISD::SUBE:
11913    // We don't want to expand or promote these.
11914    return;
11915  case ISD::FP_TO_SINT:
11916  case ISD::FP_TO_UINT: {
11917    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
11918
11919    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
11920      return;
11921
11922    std::pair<SDValue,SDValue> Vals =
11923        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
11924    SDValue FIST = Vals.first, StackSlot = Vals.second;
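    // FP_TO_INTHelper either returns a FIST store whose integer result must
    // be reloaded from StackSlot, or (when StackSlot is null, e.g. on the
    // WIN_FTOL path) a value that can be used directly.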
11925    if (FIST.getNode() != 0) {
11926      EVT VT = N->getValueType(0);
11927      // Return a load from the stack slot.
11928      if (StackSlot.getNode() != 0)
11929        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
11930                                      MachinePointerInfo(),
11931                                      false, false, false, 0));
11932      else
11933        Results.push_back(FIST);
11934    }
11935    return;
11936  }
11937  case ISD::UINT_TO_FP: {
11938    if (N->getOperand(0).getValueType() != MVT::v2i32 ||
11939        N->getValueType(0) != MVT::v2f32)
11940      return;
11941    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
11942                                 N->getOperand(0));
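    // Standard u32 -> f64 trick: 0x4330000000000000 is the bit pattern of
    // 2^52.  OR-ing a zero-extended 32-bit integer into the low mantissa bits
    // of 2^52 yields the exact value 2^52 + x, so subtracting the bias gives
    // x as an f64, which is then rounded to f32.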
11943    SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
11944                                     MVT::f64);
11945    SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
11946    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
11947                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
11948    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
11949    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
11950    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
11951    return;
11952  }
11953  case ISD::FP_ROUND: {
11954    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
11955        return;
11956    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
11957    Results.push_back(V);
11958    return;
11959  }
11960  case ISD::READCYCLECOUNTER: {
11961    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
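    // RDTSC returns the 64-bit time-stamp counter in EDX:EAX; read both
    // halves with glued CopyFromReg nodes and pair them below.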
11962    SDValue TheChain = N->getOperand(0);
11963    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
11964    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
11965                                     rd.getValue(1));
11966    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
11967                                     eax.getValue(2));
11968    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
11969    SDValue Ops[] = { eax, edx };
11970    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
11971    Results.push_back(edx.getValue(1));
11972    return;
11973  }
11974  case ISD::ATOMIC_CMP_SWAP: {
11975    EVT T = N->getValueType(0);
11976    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
11977    bool Regs64bit = T == MVT::i128;
11978    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
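    // CMPXCHG8B/CMPXCHG16B expect the compare value in EDX:EAX (RDX:RAX) and
    // the replacement value in ECX:EBX (RCX:RBX); the old memory value comes
    // back in EDX:EAX (RDX:RAX).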
11979    SDValue cpInL, cpInH;
11980    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
11981                        DAG.getConstant(0, HalfT));
11982    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
11983                        DAG.getConstant(1, HalfT));
11984    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
11985                             Regs64bit ? X86::RAX : X86::EAX,
11986                             cpInL, SDValue());
11987    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
11988                             Regs64bit ? X86::RDX : X86::EDX,
11989                             cpInH, cpInL.getValue(1));
11990    SDValue swapInL, swapInH;
11991    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
11992                          DAG.getConstant(0, HalfT));
11993    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
11994                          DAG.getConstant(1, HalfT));
11995    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
11996                               Regs64bit ? X86::RBX : X86::EBX,
11997                               swapInL, cpInH.getValue(1));
11998    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
11999                               Regs64bit ? X86::RCX : X86::ECX,
12000                               swapInH, swapInL.getValue(1));
12001    SDValue Ops[] = { swapInH.getValue(0),
12002                      N->getOperand(1),
12003                      swapInH.getValue(1) };
12004    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
12005    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
12006    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
12007                                  X86ISD::LCMPXCHG8_DAG;
12008    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
12009                                             Ops, 3, T, MMO);
12010    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
12011                                        Regs64bit ? X86::RAX : X86::EAX,
12012                                        HalfT, Result.getValue(1));
12013    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
12014                                        Regs64bit ? X86::RDX : X86::EDX,
12015                                        HalfT, cpOutL.getValue(2));
12016    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
12017    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
12018    Results.push_back(cpOutH.getValue(1));
12019    return;
12020  }
12021  case ISD::ATOMIC_LOAD_ADD:
12022  case ISD::ATOMIC_LOAD_AND:
12023  case ISD::ATOMIC_LOAD_NAND:
12024  case ISD::ATOMIC_LOAD_OR:
12025  case ISD::ATOMIC_LOAD_SUB:
12026  case ISD::ATOMIC_LOAD_XOR:
12027  case ISD::ATOMIC_LOAD_MAX:
12028  case ISD::ATOMIC_LOAD_MIN:
12029  case ISD::ATOMIC_LOAD_UMAX:
12030  case ISD::ATOMIC_LOAD_UMIN:
12031  case ISD::ATOMIC_SWAP: {
12032    unsigned Opc;
12033    switch (N->getOpcode()) {
12034    default: llvm_unreachable("Unexpected opcode");
12035    case ISD::ATOMIC_LOAD_ADD:
12036      Opc = X86ISD::ATOMADD64_DAG;
12037      break;
12038    case ISD::ATOMIC_LOAD_AND:
12039      Opc = X86ISD::ATOMAND64_DAG;
12040      break;
12041    case ISD::ATOMIC_LOAD_NAND:
12042      Opc = X86ISD::ATOMNAND64_DAG;
12043      break;
12044    case ISD::ATOMIC_LOAD_OR:
12045      Opc = X86ISD::ATOMOR64_DAG;
12046      break;
12047    case ISD::ATOMIC_LOAD_SUB:
12048      Opc = X86ISD::ATOMSUB64_DAG;
12049      break;
12050    case ISD::ATOMIC_LOAD_XOR:
12051      Opc = X86ISD::ATOMXOR64_DAG;
12052      break;
12053    case ISD::ATOMIC_LOAD_MAX:
12054      Opc = X86ISD::ATOMMAX64_DAG;
12055      break;
12056    case ISD::ATOMIC_LOAD_MIN:
12057      Opc = X86ISD::ATOMMIN64_DAG;
12058      break;
12059    case ISD::ATOMIC_LOAD_UMAX:
12060      Opc = X86ISD::ATOMUMAX64_DAG;
12061      break;
12062    case ISD::ATOMIC_LOAD_UMIN:
12063      Opc = X86ISD::ATOMUMIN64_DAG;
12064      break;
12065    case ISD::ATOMIC_SWAP:
12066      Opc = X86ISD::ATOMSWAP64_DAG;
12067      break;
12068    }
12069    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
12070    return;
12071  }
12072  case ISD::ATOMIC_LOAD:
12073    ReplaceATOMIC_LOAD(N, Results, DAG);
12074  }
12075}
12076
12077const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
12078  switch (Opcode) {
12079  default: return NULL;
12080  case X86ISD::BSF:                return "X86ISD::BSF";
12081  case X86ISD::BSR:                return "X86ISD::BSR";
12082  case X86ISD::SHLD:               return "X86ISD::SHLD";
12083  case X86ISD::SHRD:               return "X86ISD::SHRD";
12084  case X86ISD::FAND:               return "X86ISD::FAND";
12085  case X86ISD::FOR:                return "X86ISD::FOR";
12086  case X86ISD::FXOR:               return "X86ISD::FXOR";
12087  case X86ISD::FSRL:               return "X86ISD::FSRL";
12088  case X86ISD::FILD:               return "X86ISD::FILD";
12089  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
12090  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
12091  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
12092  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
12093  case X86ISD::FLD:                return "X86ISD::FLD";
12094  case X86ISD::FST:                return "X86ISD::FST";
12095  case X86ISD::CALL:               return "X86ISD::CALL";
12096  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
12097  case X86ISD::BT:                 return "X86ISD::BT";
12098  case X86ISD::CMP:                return "X86ISD::CMP";
12099  case X86ISD::COMI:               return "X86ISD::COMI";
12100  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
12101  case X86ISD::SETCC:              return "X86ISD::SETCC";
12102  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
12103  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
12104  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
12105  case X86ISD::CMOV:               return "X86ISD::CMOV";
12106  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
12107  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
12108  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
12109  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
12110  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
12111  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
12112  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
12113  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
12114  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
12115  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
12116  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
12117  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
12118  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
12119  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
12120  case X86ISD::PSIGN:              return "X86ISD::PSIGN";
12121  case X86ISD::BLENDV:             return "X86ISD::BLENDV";
12122  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
12123  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
12124  case X86ISD::HADD:               return "X86ISD::HADD";
12125  case X86ISD::HSUB:               return "X86ISD::HSUB";
12126  case X86ISD::FHADD:              return "X86ISD::FHADD";
12127  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
12128  case X86ISD::UMAX:               return "X86ISD::UMAX";
12129  case X86ISD::UMIN:               return "X86ISD::UMIN";
12130  case X86ISD::SMAX:               return "X86ISD::SMAX";
12131  case X86ISD::SMIN:               return "X86ISD::SMIN";
12132  case X86ISD::FMAX:               return "X86ISD::FMAX";
12133  case X86ISD::FMIN:               return "X86ISD::FMIN";
12134  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
12135  case X86ISD::FMINC:              return "X86ISD::FMINC";
12136  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
12137  case X86ISD::FRCP:               return "X86ISD::FRCP";
12138  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
12139  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
12140  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
12141  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
12142  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
12143  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
12144  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
12145  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
12146  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
12147  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
12148  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
12149  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
12150  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
12151  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
12152  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
12153  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
12154  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
12155  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
12156  case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
12157  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
12158  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
12159  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
12160  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
12161  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
12162  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
12163  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
12164  case X86ISD::VSHL:               return "X86ISD::VSHL";
12165  case X86ISD::VSRL:               return "X86ISD::VSRL";
12166  case X86ISD::VSRA:               return "X86ISD::VSRA";
12167  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
12168  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
12169  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
12170  case X86ISD::CMPP:               return "X86ISD::CMPP";
12171  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
12172  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
12173  case X86ISD::ADD:                return "X86ISD::ADD";
12174  case X86ISD::SUB:                return "X86ISD::SUB";
12175  case X86ISD::ADC:                return "X86ISD::ADC";
12176  case X86ISD::SBB:                return "X86ISD::SBB";
12177  case X86ISD::SMUL:               return "X86ISD::SMUL";
12178  case X86ISD::UMUL:               return "X86ISD::UMUL";
12179  case X86ISD::INC:                return "X86ISD::INC";
12180  case X86ISD::DEC:                return "X86ISD::DEC";
12181  case X86ISD::OR:                 return "X86ISD::OR";
12182  case X86ISD::XOR:                return "X86ISD::XOR";
12183  case X86ISD::AND:                return "X86ISD::AND";
12184  case X86ISD::BLSI:               return "X86ISD::BLSI";
12185  case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
12186  case X86ISD::BLSR:               return "X86ISD::BLSR";
12187  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
12188  case X86ISD::PTEST:              return "X86ISD::PTEST";
12189  case X86ISD::TESTP:              return "X86ISD::TESTP";
12190  case X86ISD::PALIGN:             return "X86ISD::PALIGN";
12191  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
12192  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
12193  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
12194  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
12195  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
12196  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
12197  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
12198  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
12199  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
12200  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
12201  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
12202  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
12203  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
12204  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
12205  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
12206  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
12207  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
12208  case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
12209  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
12210  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
12211  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
12212  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
12213  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
12214  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
12215  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
12216  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
12217  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
12218  case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
12219  case X86ISD::SAHF:               return "X86ISD::SAHF";
12220  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
12221  case X86ISD::FMADD:              return "X86ISD::FMADD";
12222  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
12223  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
12224  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
12225  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
12226  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
12227  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
12228  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
12229  }
12230}
12231
12232// isLegalAddressingMode - Return true if the addressing mode represented
12233// by AM is legal for this target, for a load/store of the specified type.
12234bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
12235                                              Type *Ty) const {
12236  // X86 supports extremely general addressing modes.
12237  CodeModel::Model M = getTargetMachine().getCodeModel();
12238  Reloc::Model R = getTargetMachine().getRelocationModel();
12239
12240  // X86 allows a sign-extended 32-bit immediate field as a displacement.
12241  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
12242    return false;
12243
12244  if (AM.BaseGV) {
12245    unsigned GVFlags =
12246      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
12247
12248    // If a reference to this global requires an extra load, we can't fold it.
12249    if (isGlobalStubReference(GVFlags))
12250      return false;
12251
12252    // If BaseGV requires a register for the PIC base, we cannot also have a
12253    // BaseReg specified.
12254    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
12255      return false;
12256
12257    // If lower 4G is not available, then we must use rip-relative addressing.
12258    if ((M != CodeModel::Small || R != Reloc::Static) &&
12259        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
12260      return false;
12261  }
12262
12263  switch (AM.Scale) {
12264  case 0:
12265  case 1:
12266  case 2:
12267  case 4:
12268  case 8:
12269    // These scales always work.
12270    break;
12271  case 3:
12272  case 5:
12273  case 9:
12274    // These scales are formed with basereg+scalereg.  Only accept if there is
12275    // no basereg yet.
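    // (e.g. scale 3 is encoded as base + index*2 with base == index, so the
    // base-register slot is already taken).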
12276    if (AM.HasBaseReg)
12277      return false;
12278    break;
12279  default:  // Other stuff never works.
12280    return false;
12281  }
12282
12283  return true;
12284}
12285
12286bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
12287  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
12288    return false;
12289  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
12290  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
12291  if (NumBits1 <= NumBits2)
12292    return false;
12293  return true;
12294}
12295
12296bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
12297  return Imm == (int32_t)Imm;
12298}
12299
12300bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
12301  // Can also use sub to handle negated immediates.
12302  return Imm == (int32_t)Imm;
12303}
12304
12305bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
12306  if (!VT1.isInteger() || !VT2.isInteger())
12307    return false;
12308  unsigned NumBits1 = VT1.getSizeInBits();
12309  unsigned NumBits2 = VT2.getSizeInBits();
12310  if (NumBits1 <= NumBits2)
12311    return false;
12312  return true;
12313}
12314
12315bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
12316  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
12317  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
12318}
12319
12320bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
12321  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
12322  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
12323}
12324
12325bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
12326  EVT VT1 = Val.getValueType();
12327  if (isZExtFree(VT1, VT2))
12328    return true;
12329
12330  if (Val.getOpcode() != ISD::LOAD)
12331    return false;
12332
12333  if (!VT1.isSimple() || !VT1.isInteger() ||
12334      !VT2.isSimple() || !VT2.isInteger())
12335    return false;
12336
12337  switch (VT1.getSimpleVT().SimpleTy) {
12338  default: break;
12339  case MVT::i8:
12340  case MVT::i16:
12341  case MVT::i32:
12342    // X86 has 8, 16, and 32-bit zero-extending loads.
12343    return true;
12344  }
12345
12346  return false;
12347}
12348
12349bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
12350  // i16 instructions are longer (0x66 prefix) and potentially slower.
12351  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
12352}
12353
12354/// isShuffleMaskLegal - Targets can use this to indicate that they only
12355/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
12356/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
12357/// are assumed to be legal.
12358bool
12359X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
12360                                      EVT VT) const {
12361  // Very little shuffling can be done for 64-bit vectors right now.
12362  if (VT.getSizeInBits() == 64)
12363    return false;
12364
12365  // FIXME: pshufb, blends, shifts.
12366  return (VT.getVectorNumElements() == 2 ||
12367          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
12368          isMOVLMask(M, VT) ||
12369          isSHUFPMask(M, VT, Subtarget->hasFp256()) ||
12370          isPSHUFDMask(M, VT) ||
12371          isPSHUFHWMask(M, VT, Subtarget->hasInt256()) ||
12372          isPSHUFLWMask(M, VT, Subtarget->hasInt256()) ||
12373          isPALIGNRMask(M, VT, Subtarget) ||
12374          isUNPCKLMask(M, VT, Subtarget->hasInt256()) ||
12375          isUNPCKHMask(M, VT, Subtarget->hasInt256()) ||
12376          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) ||
12377          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256()));
12378}
12379
12380bool
12381X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
12382                                          EVT VT) const {
12383  unsigned NumElts = VT.getVectorNumElements();
12384  // FIXME: This collection of masks seems suspect.
12385  if (NumElts == 2)
12386    return true;
12387  if (NumElts == 4 && VT.is128BitVector()) {
12388    return (isMOVLMask(Mask, VT)  ||
12389            isCommutedMOVLMask(Mask, VT, true) ||
12390            isSHUFPMask(Mask, VT, Subtarget->hasFp256()) ||
12391            isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true));
12392  }
12393  return false;
12394}
12395
12396//===----------------------------------------------------------------------===//
12397//                           X86 Scheduler Hooks
12398//===----------------------------------------------------------------------===//
12399
12400/// Utility function to emit xbegin specifying the start of an RTM region.
12401static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
12402                                     const TargetInstrInfo *TII) {
12403  DebugLoc DL = MI->getDebugLoc();
12404
12405  const BasicBlock *BB = MBB->getBasicBlock();
12406  MachineFunction::iterator I = MBB;
12407  ++I;
12408
12409  // For the v = xbegin(), we generate
12410  //
12411  // thisMBB:
12412  //  xbegin sinkMBB
12413  //
12414  // mainMBB:
12415  //  eax = -1
12416  //
12417  // sinkMBB:
12418  //  v = eax
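  //
  // On a successful start, XBEGIN falls through to mainMBB and the result is
  // the -1 loaded there; on an abort, the CPU resumes at the fallback label
  // (sinkMBB) with the abort status already in EAX, which sinkMBB copies into
  // the result register.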
12419
12420  MachineBasicBlock *thisMBB = MBB;
12421  MachineFunction *MF = MBB->getParent();
12422  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
12423  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
12424  MF->insert(I, mainMBB);
12425  MF->insert(I, sinkMBB);
12426
12427  // Transfer the remainder of BB and its successor edges to sinkMBB.
12428  sinkMBB->splice(sinkMBB->begin(), MBB,
12429                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
12430  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
12431
12432  // thisMBB:
12433  //  xbegin sinkMBB
12434  //  # fallthrough to mainMBB
12435  //  # abort to sinkMBB
12436  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
12437  thisMBB->addSuccessor(mainMBB);
12438  thisMBB->addSuccessor(sinkMBB);
12439
12440  // mainMBB:
12441  //  EAX = -1
12442  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
12443  mainMBB->addSuccessor(sinkMBB);
12444
12445  // sinkMBB:
12446  // EAX is live into the sinkMBB
12447  sinkMBB->addLiveIn(X86::EAX);
12448  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
12449          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
12450    .addReg(X86::EAX);
12451
12452  MI->eraseFromParent();
12453  return sinkMBB;
12454}
12455
12456// Get CMPXCHG opcode for the specified data type.
12457static unsigned getCmpXChgOpcode(EVT VT) {
12458  switch (VT.getSimpleVT().SimpleTy) {
12459  case MVT::i8:  return X86::LCMPXCHG8;
12460  case MVT::i16: return X86::LCMPXCHG16;
12461  case MVT::i32: return X86::LCMPXCHG32;
12462  case MVT::i64: return X86::LCMPXCHG64;
12463  default:
12464    break;
12465  }
12466  llvm_unreachable("Invalid operand size!");
12467}
12468
12469// Get LOAD opcode for the specified data type.
12470static unsigned getLoadOpcode(EVT VT) {
12471  switch (VT.getSimpleVT().SimpleTy) {
12472  case MVT::i8:  return X86::MOV8rm;
12473  case MVT::i16: return X86::MOV16rm;
12474  case MVT::i32: return X86::MOV32rm;
12475  case MVT::i64: return X86::MOV64rm;
12476  default:
12477    break;
12478  }
12479  llvm_unreachable("Invalid operand size!");
12480}
12481
12482// Get the non-atomic opcode corresponding to the specified atomic instruction.
12483static unsigned getNonAtomicOpcode(unsigned Opc) {
12484  switch (Opc) {
12485  case X86::ATOMAND8:  return X86::AND8rr;
12486  case X86::ATOMAND16: return X86::AND16rr;
12487  case X86::ATOMAND32: return X86::AND32rr;
12488  case X86::ATOMAND64: return X86::AND64rr;
12489  case X86::ATOMOR8:   return X86::OR8rr;
12490  case X86::ATOMOR16:  return X86::OR16rr;
12491  case X86::ATOMOR32:  return X86::OR32rr;
12492  case X86::ATOMOR64:  return X86::OR64rr;
12493  case X86::ATOMXOR8:  return X86::XOR8rr;
12494  case X86::ATOMXOR16: return X86::XOR16rr;
12495  case X86::ATOMXOR32: return X86::XOR32rr;
12496  case X86::ATOMXOR64: return X86::XOR64rr;
12497  }
12498  llvm_unreachable("Unhandled atomic-load-op opcode!");
12499}
12500
12501// Get the non-atomic opcode corresponding to the specified atomic instruction,
12502// along with the extra opcode (NOT or CMP) its expansion needs.
12503static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
12504                                               unsigned &ExtraOpc) {
12505  switch (Opc) {
12506  case X86::ATOMNAND8:  ExtraOpc = X86::NOT8r;   return X86::AND8rr;
12507  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r;  return X86::AND16rr;
12508  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r;  return X86::AND32rr;
12509  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r;  return X86::AND64rr;
12510  case X86::ATOMMAX8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVL32rr;
12511  case X86::ATOMMAX16:  ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
12512  case X86::ATOMMAX32:  ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
12513  case X86::ATOMMAX64:  ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
12514  case X86::ATOMMIN8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVG32rr;
12515  case X86::ATOMMIN16:  ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
12516  case X86::ATOMMIN32:  ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
12517  case X86::ATOMMIN64:  ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
12518  case X86::ATOMUMAX8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVB32rr;
12519  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
12520  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
12521  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
12522  case X86::ATOMUMIN8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVA32rr;
12523  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
12524  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
12525  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
12526  }
12527  llvm_unreachable("Unhandled atomic-load-op opcode!");
12528}
12529
12530// Get the non-atomic opcode corresponding to the specified atomic instruction
12531// for a 64-bit data type on a 32-bit target.
12532static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
12533  switch (Opc) {
12534  case X86::ATOMAND6432:  HiOpc = X86::AND32rr; return X86::AND32rr;
12535  case X86::ATOMOR6432:   HiOpc = X86::OR32rr;  return X86::OR32rr;
12536  case X86::ATOMXOR6432:  HiOpc = X86::XOR32rr; return X86::XOR32rr;
12537  case X86::ATOMADD6432:  HiOpc = X86::ADC32rr; return X86::ADD32rr;
12538  case X86::ATOMSUB6432:  HiOpc = X86::SBB32rr; return X86::SUB32rr;
12539  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
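  // For 64-bit min/max the returned opcodes are SETcc's on the low/high
  // halves; the actual selection between the old and the new value is emitted
  // by EmitAtomicLoadArith6432.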
12540  case X86::ATOMMAX6432:  HiOpc = X86::SETLr;   return X86::SETLr;
12541  case X86::ATOMMIN6432:  HiOpc = X86::SETGr;   return X86::SETGr;
12542  case X86::ATOMUMAX6432: HiOpc = X86::SETBr;   return X86::SETBr;
12543  case X86::ATOMUMIN6432: HiOpc = X86::SETAr;   return X86::SETAr;
12544  }
12545  llvm_unreachable("Unhandled atomic-load-op opcode!");
12546}
12547
12548// Get the non-atomic opcode corresponding to the specified atomic instruction
12549// for a 64-bit data type on a 32-bit target, plus the extra opcode it needs.
12550static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
12551                                                   unsigned &HiOpc,
12552                                                   unsigned &ExtraOpc) {
12553  switch (Opc) {
12554  case X86::ATOMNAND6432:
12555    ExtraOpc = X86::NOT32r;
12556    HiOpc = X86::AND32rr;
12557    return X86::AND32rr;
12558  }
12559  llvm_unreachable("Unhandled atomic-load-op opcode!");
12560}
12561
12562// Get pseudo CMOV opcode from the specified data type.
12563static unsigned getPseudoCMOVOpc(EVT VT) {
12564  switch (VT.getSimpleVT().SimpleTy) {
12565  case MVT::i8:  return X86::CMOV_GR8;
12566  case MVT::i16: return X86::CMOV_GR16;
12567  case MVT::i32: return X86::CMOV_GR32;
12568  default:
12569    break;
12570  }
12571  llvm_unreachable("Unknown CMOV opcode!");
12572}
12573
12574// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
12575// They will be translated into a spin-loop or compare-exchange loop from
12576//
12577//    ...
12578//    dst = atomic-fetch-op MI.addr, MI.val
12579//    ...
12580//
12581// to
12582//
12583//    ...
12584//    EAX = LOAD MI.addr
12585// loop:
12586//    t1 = OP MI.val, EAX
12587//    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
12588//    JNE loop
12589// sink:
12590//    dst = EAX
12591//    ...
12592MachineBasicBlock *
12593X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
12594                                       MachineBasicBlock *MBB) const {
12595  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12596  DebugLoc DL = MI->getDebugLoc();
12597
12598  MachineFunction *MF = MBB->getParent();
12599  MachineRegisterInfo &MRI = MF->getRegInfo();
12600
12601  const BasicBlock *BB = MBB->getBasicBlock();
12602  MachineFunction::iterator I = MBB;
12603  ++I;
12604
12605  assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 &&
12606         "Unexpected number of operands");
12607
12608  assert(MI->hasOneMemOperand() &&
12609         "Expected atomic-load-op to have one memoperand");
12610
12611  // Memory Reference
12612  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
12613  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
12614
12615  unsigned DstReg, SrcReg;
12616  unsigned MemOpndSlot;
12617
12618  unsigned CurOp = 0;
12619
12620  DstReg = MI->getOperand(CurOp++).getReg();
12621  MemOpndSlot = CurOp;
12622  CurOp += X86::AddrNumOperands;
12623  SrcReg = MI->getOperand(CurOp++).getReg();
12624
12625  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
12626  MVT::SimpleValueType VT = *RC->vt_begin();
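  // AccPhyReg is the accumulator (AL/AX/EAX/RAX) that LCMPXCHG implicitly
  // compares against and updates.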
12627  unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT);
12628
12629  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
12630  unsigned LOADOpc = getLoadOpcode(VT);
12631
12632  // For the atomic load-arith operator, we generate
12633  //
12634  //  thisMBB:
12635  //    EAX = LOAD [MI.addr]
12636  //  mainMBB:
12637  //    t1 = OP MI.val, EAX
12638  //    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
12639  //    JNE mainMBB
12640  //  sinkMBB:
12641
12642  MachineBasicBlock *thisMBB = MBB;
12643  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
12644  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
12645  MF->insert(I, mainMBB);
12646  MF->insert(I, sinkMBB);
12647
12648  MachineInstrBuilder MIB;
12649
12650  // Transfer the remainder of BB and its successor edges to sinkMBB.
12651  sinkMBB->splice(sinkMBB->begin(), MBB,
12652                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
12653  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
12654
12655  // thisMBB:
12656  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg);
12657  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
12658    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
12659  MIB.setMemRefs(MMOBegin, MMOEnd);
12660
12661  thisMBB->addSuccessor(mainMBB);
12662
12663  // mainMBB:
12664  MachineBasicBlock *origMainMBB = mainMBB;
12665  mainMBB->addLiveIn(AccPhyReg);
12666
12667  // Copy AccPhyReg as it is used more than once.
12668  unsigned AccReg = MRI.createVirtualRegister(RC);
12669  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg)
12670    .addReg(AccPhyReg);
12671
12672  unsigned t1 = MRI.createVirtualRegister(RC);
12673  unsigned Opc = MI->getOpcode();
12674  switch (Opc) {
12675  default:
12676    llvm_unreachable("Unhandled atomic-load-op opcode!");
12677  case X86::ATOMAND8:
12678  case X86::ATOMAND16:
12679  case X86::ATOMAND32:
12680  case X86::ATOMAND64:
12681  case X86::ATOMOR8:
12682  case X86::ATOMOR16:
12683  case X86::ATOMOR32:
12684  case X86::ATOMOR64:
12685  case X86::ATOMXOR8:
12686  case X86::ATOMXOR16:
12687  case X86::ATOMXOR32:
12688  case X86::ATOMXOR64: {
12689    unsigned ARITHOpc = getNonAtomicOpcode(Opc);
12690    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg)
12691      .addReg(AccReg);
12692    break;
12693  }
12694  case X86::ATOMNAND8:
12695  case X86::ATOMNAND16:
12696  case X86::ATOMNAND32:
12697  case X86::ATOMNAND64: {
12698    unsigned t2 = MRI.createVirtualRegister(RC);
12699    unsigned NOTOpc;
12700    unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
12701    BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg)
12702      .addReg(AccReg);
12703    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2);
12704    break;
12705  }
12706  case X86::ATOMMAX8:
12707  case X86::ATOMMAX16:
12708  case X86::ATOMMAX32:
12709  case X86::ATOMMAX64:
12710  case X86::ATOMMIN8:
12711  case X86::ATOMMIN16:
12712  case X86::ATOMMIN32:
12713  case X86::ATOMMIN64:
12714  case X86::ATOMUMAX8:
12715  case X86::ATOMUMAX16:
12716  case X86::ATOMUMAX32:
12717  case X86::ATOMUMAX64:
12718  case X86::ATOMUMIN8:
12719  case X86::ATOMUMIN16:
12720  case X86::ATOMUMIN32:
12721  case X86::ATOMUMIN64: {
12722    unsigned CMPOpc;
12723    unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
12724
12725    BuildMI(mainMBB, DL, TII->get(CMPOpc))
12726      .addReg(SrcReg)
12727      .addReg(AccReg);
12728
12729    if (Subtarget->hasCMov()) {
12730      if (VT != MVT::i8) {
12731        // Native support
12732        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1)
12733          .addReg(SrcReg)
12734          .addReg(AccReg);
12735      } else {
12736        // Promote i8 to i32 to use CMOV32
12737        const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32);
12738        unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
12739        unsigned AccReg32 = MRI.createVirtualRegister(RC32);
12740        unsigned t2 = MRI.createVirtualRegister(RC32);
12741
12742        unsigned Undef = MRI.createVirtualRegister(RC32);
12743        BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
12744
12745        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
12746          .addReg(Undef)
12747          .addReg(SrcReg)
12748          .addImm(X86::sub_8bit);
12749        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
12750          .addReg(Undef)
12751          .addReg(AccReg)
12752          .addImm(X86::sub_8bit);
12753
12754        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
12755          .addReg(SrcReg32)
12756          .addReg(AccReg32);
12757
12758        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1)
12759          .addReg(t2, 0, X86::sub_8bit);
12760      }
12761    } else {
12762      // Use pseudo select and lower them.
12763      assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
12764             "Invalid atomic-load-op transformation!");
12765      unsigned SelOpc = getPseudoCMOVOpc(VT);
12766      X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
12767      assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
12768      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1)
12769              .addReg(SrcReg).addReg(AccReg)
12770              .addImm(CC);
12771      mainMBB = EmitLoweredSelect(MIB, mainMBB);
12772    }
12773    break;
12774  }
12775  }
12776
12777  // Copy AccPhyReg back from virtual register.
12778  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg)
12779    .addReg(AccReg);
12780
12781  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
12782  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
12783    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
12784  MIB.addReg(t1);
12785  MIB.setMemRefs(MMOBegin, MMOEnd);
12786
12787  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
12788
12789  mainMBB->addSuccessor(origMainMBB);
12790  mainMBB->addSuccessor(sinkMBB);
12791
12792  // sinkMBB:
12793  sinkMBB->addLiveIn(AccPhyReg);
12794
12795  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
12796          TII->get(TargetOpcode::COPY), DstReg)
12797    .addReg(AccPhyReg);
12798
12799  MI->eraseFromParent();
12800  return sinkMBB;
12801}
12802
12803// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
12804// instructions. They will be translated into a spin-loop or compare-exchange
12805// loop from
12806//
12807//    ...
12808//    dst = atomic-fetch-op MI.addr, MI.val
12809//    ...
12810//
12811// to
12812//
12813//    ...
12814//    EAX = LOAD [MI.addr + 0]
12815//    EDX = LOAD [MI.addr + 4]
12816// loop:
12817//    EBX = OP MI.val.lo, EAX
12818//    ECX = OP MI.val.hi, EDX
12819//    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
12820//    JNE loop
12821// sink:
12822//    dst = EDX:EAX
12823//    ...
12824MachineBasicBlock *
12825X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
12826                                           MachineBasicBlock *MBB) const {
12827  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12828  DebugLoc DL = MI->getDebugLoc();
12829
12830  MachineFunction *MF = MBB->getParent();
12831  MachineRegisterInfo &MRI = MF->getRegInfo();
12832
12833  const BasicBlock *BB = MBB->getBasicBlock();
12834  MachineFunction::iterator I = MBB;
12835  ++I;
12836
12837  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
12838         "Unexpected number of operands");
12839
12840  assert(MI->hasOneMemOperand() &&
12841         "Expected atomic-load-op32 to have one memoperand");
12842
12843  // Memory Reference
12844  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
12845  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
12846
12847  unsigned DstLoReg, DstHiReg;
12848  unsigned SrcLoReg, SrcHiReg;
12849  unsigned MemOpndSlot;
12850
12851  unsigned CurOp = 0;
12852
12853  DstLoReg = MI->getOperand(CurOp++).getReg();
12854  DstHiReg = MI->getOperand(CurOp++).getReg();
12855  MemOpndSlot = CurOp;
12856  CurOp += X86::AddrNumOperands;
12857  SrcLoReg = MI->getOperand(CurOp++).getReg();
12858  SrcHiReg = MI->getOperand(CurOp++).getReg();
12859
12860  const TargetRegisterClass *RC = &X86::GR32RegClass;
12861  const TargetRegisterClass *RC8 = &X86::GR8RegClass;
12862
12863  unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
12864  unsigned LOADOpc = X86::MOV32rm;
12865
12866  // For the atomic load-arith operator, we generate
12867  //
12868  //  thisMBB:
12869  //    EAX = LOAD [MI.addr + 0]
12870  //    EDX = LOAD [MI.addr + 4]
12871  //  mainMBB:
12872  //    EBX = OP MI.val.lo, EAX
12873  //    ECX = OP MI.val.hi, EDX
12874  //    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
12875  //    JNE mainMBB
12876  //  sinkMBB:
12877
12878  MachineBasicBlock *thisMBB = MBB;
12879  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
12880  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
12881  MF->insert(I, mainMBB);
12882  MF->insert(I, sinkMBB);
12883
12884  MachineInstrBuilder MIB;
12885
12886  // Transfer the remainder of BB and its successor edges to sinkMBB.
12887  sinkMBB->splice(sinkMBB->begin(), MBB,
12888                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
12889  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
12890
12891  // thisMBB:
12892  // Lo
12893  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX);
12894  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
12895    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
12896  MIB.setMemRefs(MMOBegin, MMOEnd);
12897  // Hi
12898  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX);
12899  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
12900    if (i == X86::AddrDisp)
12901      MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
12902    else
12903      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
12904  }
12905  MIB.setMemRefs(MMOBegin, MMOEnd);
12906
12907  thisMBB->addSuccessor(mainMBB);
12908
12909  // mainMBB:
12910  MachineBasicBlock *origMainMBB = mainMBB;
12911  mainMBB->addLiveIn(X86::EAX);
12912  mainMBB->addLiveIn(X86::EDX);
12913
12914  // Copy EDX:EAX as they are used more than once.
12915  unsigned LoReg = MRI.createVirtualRegister(RC);
12916  unsigned HiReg = MRI.createVirtualRegister(RC);
12917  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX);
12918  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX);
12919
12920  unsigned t1L = MRI.createVirtualRegister(RC);
12921  unsigned t1H = MRI.createVirtualRegister(RC);
12922
12923  unsigned Opc = MI->getOpcode();
12924  switch (Opc) {
12925  default:
12926    llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
12927  case X86::ATOMAND6432:
12928  case X86::ATOMOR6432:
12929  case X86::ATOMXOR6432:
12930  case X86::ATOMADD6432:
12931  case X86::ATOMSUB6432: {
12932    unsigned HiOpc;
12933    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
12934    BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg);
12935    BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg);
12936    break;
12937  }
12938  case X86::ATOMNAND6432: {
12939    unsigned HiOpc, NOTOpc;
12940    unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
12941    unsigned t2L = MRI.createVirtualRegister(RC);
12942    unsigned t2H = MRI.createVirtualRegister(RC);
12943    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg);
12944    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg);
12945    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L);
12946    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H);
12947    break;
12948  }
12949  case X86::ATOMMAX6432:
12950  case X86::ATOMMIN6432:
12951  case X86::ATOMUMAX6432:
12952  case X86::ATOMUMIN6432: {
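    // 64-bit min/max on a 32-bit target: materialize SETcc results for the
    // low-half and high-half compares, pick the low-half result when the high
    // halves are equal (otherwise the high-half compare decides), then use
    // that flag to keep either the current value or the incoming operand in
    // t1H:t1L.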
12953    unsigned HiOpc;
12954    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
12955    unsigned cL = MRI.createVirtualRegister(RC8);
12956    unsigned cH = MRI.createVirtualRegister(RC8);
12957    unsigned cL32 = MRI.createVirtualRegister(RC);
12958    unsigned cH32 = MRI.createVirtualRegister(RC);
12959    unsigned cc = MRI.createVirtualRegister(RC);
12960    // cl := cmp src_lo, lo
12961    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
12962      .addReg(SrcLoReg).addReg(LoReg);
12963    BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
12964    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
12965    // ch := cmp src_hi, hi
12966    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
12967      .addReg(SrcHiReg).addReg(HiReg);
12968    BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
12969    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
12970    // cc := if (src_hi == hi) ? cl : ch;
12971    if (Subtarget->hasCMov()) {
12972      BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
12973        .addReg(cH32).addReg(cL32);
12974    } else {
12975      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
12976              .addReg(cH32).addReg(cL32)
12977              .addImm(X86::COND_E);
12978      mainMBB = EmitLoweredSelect(MIB, mainMBB);
12979    }
12980    BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
12981    if (Subtarget->hasCMov()) {
12982      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L)
12983        .addReg(SrcLoReg).addReg(LoReg);
12984      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H)
12985        .addReg(SrcHiReg).addReg(HiReg);
12986    } else {
12987      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L)
12988              .addReg(SrcLoReg).addReg(LoReg)
12989              .addImm(X86::COND_NE);
12990      mainMBB = EmitLoweredSelect(MIB, mainMBB);
12991      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H)
12992              .addReg(SrcHiReg).addReg(HiReg)
12993              .addImm(X86::COND_NE);
12994      mainMBB = EmitLoweredSelect(MIB, mainMBB);
12995    }
12996    break;
12997  }
12998  case X86::ATOMSWAP6432: {
12999    unsigned HiOpc;
13000    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
13001    BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg);
13002    BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg);
13003    break;
13004  }
13005  }
13006
13007  // Copy EDX:EAX back from HiReg:LoReg
13008  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg);
13009  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg);
13010  // Copy ECX:EBX from t1H:t1L
13011  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L);
13012  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H);
13013
13014  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
13015  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
13016    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
13017  MIB.setMemRefs(MMOBegin, MMOEnd);
13018
13019  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
13020
13021  mainMBB->addSuccessor(origMainMBB);
13022  mainMBB->addSuccessor(sinkMBB);
13023
13024  // sinkMBB:
13025  sinkMBB->addLiveIn(X86::EAX);
13026  sinkMBB->addLiveIn(X86::EDX);
13027
13028  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
13029          TII->get(TargetOpcode::COPY), DstLoReg)
13030    .addReg(X86::EAX);
13031  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
13032          TII->get(TargetOpcode::COPY), DstHiReg)
13033    .addReg(X86::EDX);
13034
13035  MI->eraseFromParent();
13036  return sinkMBB;
13037}
13038
13039// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
13040// or XMM0_V32I8 in AVX all of this code can be replaced with that
13041// in the .td file.
13042static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
13043                                       const TargetInstrInfo *TII) {
13044  unsigned Opc;
13045  switch (MI->getOpcode()) {
13046  default: llvm_unreachable("illegal opcode!");
13047  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
13048  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
13049  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
13050  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
13051  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
13052  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
13053  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
13054  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
13055  }
13056
13057  DebugLoc dl = MI->getDebugLoc();
13058  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
13059
13060  unsigned NumArgs = MI->getNumOperands();
13061  for (unsigned i = 1; i < NumArgs; ++i) {
13062    MachineOperand &Op = MI->getOperand(i);
13063    if (!(Op.isReg() && Op.isImplicit()))
13064      MIB.addOperand(Op);
13065  }
13066  if (MI->hasOneMemOperand())
13067    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
13068
13069  BuildMI(*BB, MI, dl,
13070    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
13071    .addReg(X86::XMM0);
13072
13073  MI->eraseFromParent();
13074  return BB;
13075}
13076
13077// FIXME: Custom handling because TableGen doesn't support multiple implicit
13078// defs in an instruction pattern
13079static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
13080                                       const TargetInstrInfo *TII) {
13081  unsigned Opc;
13082  switch (MI->getOpcode()) {
13083  default: llvm_unreachable("illegal opcode!");
13084  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
13085  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
13086  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
13087  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
13088  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
13089  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
13090  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
13091  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
13092  }
13093
13094  DebugLoc dl = MI->getDebugLoc();
13095  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
13096
13097  unsigned NumArgs = MI->getNumOperands(); // operand 0 (the result) is skipped below
13098  for (unsigned i = 1; i < NumArgs; ++i) {
13099    MachineOperand &Op = MI->getOperand(i);
13100    if (!(Op.isReg() && Op.isImplicit()))
13101      MIB.addOperand(Op);
13102  }
13103  if (MI->hasOneMemOperand())
13104    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
13105
13106  BuildMI(*BB, MI, dl,
13107    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
13108    .addReg(X86::ECX);
13109
13110  MI->eraseFromParent();
13111  return BB;
13112}
13113
13114static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
13115                                       const TargetInstrInfo *TII,
13116                                       const X86Subtarget* Subtarget) {
13117  DebugLoc dl = MI->getDebugLoc();
13118
13119  // Address into RAX/EAX, other two args into ECX, EDX.
13120  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
13121  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13122  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
13123  for (int i = 0; i < X86::AddrNumOperands; ++i)
13124    MIB.addOperand(MI->getOperand(i));
13125
13126  unsigned ValOps = X86::AddrNumOperands;
13127  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
13128    .addReg(MI->getOperand(ValOps).getReg());
13129  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
13130    .addReg(MI->getOperand(ValOps+1).getReg());
13131
13132  // MONITOR takes no explicit operands; it implicitly uses EAX/RAX, ECX and EDX.
13133  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
13134
13135  MI->eraseFromParent(); // The pseudo is gone now.
13136  return BB;
13137}
13138
13139MachineBasicBlock *
13140X86TargetLowering::EmitVAARG64WithCustomInserter(
13141                   MachineInstr *MI,
13142                   MachineBasicBlock *MBB) const {
13143  // Emit va_arg instruction on X86-64.
13144
13145  // Operands to this pseudo-instruction:
13146  // 0  ) Output        : destination address (reg)
13147  // 1-5) Input         : va_list address (addr, i64mem)
13148  // 6  ) ArgSize       : Size (in bytes) of vararg type
13149  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
13150  // 8  ) Align         : Alignment of type
13151  // 9  ) EFLAGS (implicit-def)
13152
13153  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
13154  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
13155
13156  unsigned DestReg = MI->getOperand(0).getReg();
13157  MachineOperand &Base = MI->getOperand(1);
13158  MachineOperand &Scale = MI->getOperand(2);
13159  MachineOperand &Index = MI->getOperand(3);
13160  MachineOperand &Disp = MI->getOperand(4);
13161  MachineOperand &Segment = MI->getOperand(5);
13162  unsigned ArgSize = MI->getOperand(6).getImm();
13163  unsigned ArgMode = MI->getOperand(7).getImm();
13164  unsigned Align = MI->getOperand(8).getImm();
13165
13166  // Memory Reference
13167  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
13168  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
13169  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
13170
13171  // Machine Information
13172  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13173  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
13174  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
13175  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
13176  DebugLoc DL = MI->getDebugLoc();
13177
13178  // struct va_list {
13179  //   i32   gp_offset
13180  //   i32   fp_offset
13181  //   i64   overflow_area (address)
13182  //   i64   reg_save_area (address)
13183  // }
13184  // sizeof(va_list) = 24
13185  // alignment(va_list) = 8
13186
13187  unsigned TotalNumIntRegs = 6;
13188  unsigned TotalNumXMMRegs = 8;
13189  bool UseGPOffset = (ArgMode == 1);
13190  bool UseFPOffset = (ArgMode == 2);
13191  unsigned MaxOffset = TotalNumIntRegs * 8 +
13192                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
13193
13194  // Align ArgSize to a multiple of 8.
13195  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
13196  bool NeedsAlign = (Align > 8);
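  // For illustration: the six 8-byte GPR slots end at offset 48 and the
  // eight 16-byte XMM slots end at 48 + 128 = 176, so MaxOffset is 48 when
  // pulling via gp_offset and 176 when pulling via fp_offset; an ArgSize of
  // 12, for instance, rounds up to ArgSizeA8 = 16.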
13197
13198  MachineBasicBlock *thisMBB = MBB;
13199  MachineBasicBlock *overflowMBB;
13200  MachineBasicBlock *offsetMBB;
13201  MachineBasicBlock *endMBB;
13202
13203  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
13204  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
13205  unsigned OffsetReg = 0;
13206
13207  if (!UseGPOffset && !UseFPOffset) {
13208    // If we only pull from the overflow region, there is no need to
13209    // branch or otherwise alter control flow.
13210    OffsetDestReg = 0; // unused
13211    OverflowDestReg = DestReg;
13212
13213    offsetMBB = NULL;
13214    overflowMBB = thisMBB;
13215    endMBB = thisMBB;
13216  } else {
13217    // First emit code to check if gp_offset (or fp_offset) is below the bound.
13218    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
13219    // If not, pull from overflow_area. (branch to overflowMBB)
13220    //
13221    //       thisMBB
13222    //         |     .
13223    //         |        .
13224    //     offsetMBB   overflowMBB
13225    //         |        .
13226    //         |     .
13227    //        endMBB
13228
13229    // Registers for the PHI in endMBB
13230    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
13231    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
13232
13233    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
13234    MachineFunction *MF = MBB->getParent();
13235    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13236    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13237    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13238
13239    MachineFunction::iterator MBBIter = MBB;
13240    ++MBBIter;
13241
13242    // Insert the new basic blocks
13243    MF->insert(MBBIter, offsetMBB);
13244    MF->insert(MBBIter, overflowMBB);
13245    MF->insert(MBBIter, endMBB);
13246
13247    // Transfer the remainder of MBB and its successor edges to endMBB.
13248    endMBB->splice(endMBB->begin(), thisMBB,
13249                    llvm::next(MachineBasicBlock::iterator(MI)),
13250                    thisMBB->end());
13251    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
13252
13253    // Make offsetMBB and overflowMBB successors of thisMBB
13254    thisMBB->addSuccessor(offsetMBB);
13255    thisMBB->addSuccessor(overflowMBB);
13256
13257    // endMBB is a successor of both offsetMBB and overflowMBB
13258    offsetMBB->addSuccessor(endMBB);
13259    overflowMBB->addSuccessor(endMBB);
13260
13261    // Load the offset value into a register
13262    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
13263    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
13264      .addOperand(Base)
13265      .addOperand(Scale)
13266      .addOperand(Index)
13267      .addDisp(Disp, UseFPOffset ? 4 : 0)
13268      .addOperand(Segment)
13269      .setMemRefs(MMOBegin, MMOEnd);
13270
13271    // Check if there is enough room left to pull this argument.
13272    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
13273      .addReg(OffsetReg)
13274      .addImm(MaxOffset + 8 - ArgSizeA8);
13275
13276    // Branch to "overflowMBB" if offset >= max
13277    // Fall through to "offsetMBB" otherwise
13278    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
13279      .addMBB(overflowMBB);
13280  }
13281
13282  // In offsetMBB, emit code to use the reg_save_area.
13283  if (offsetMBB) {
13284    assert(OffsetReg != 0);
13285
13286    // Read the reg_save_area address.
13287    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
13288    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
13289      .addOperand(Base)
13290      .addOperand(Scale)
13291      .addOperand(Index)
13292      .addDisp(Disp, 16)
13293      .addOperand(Segment)
13294      .setMemRefs(MMOBegin, MMOEnd);
13295
13296    // Zero-extend the offset
13297    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
13298    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
13299      .addImm(0)
13300      .addReg(OffsetReg)
13301      .addImm(X86::sub_32bit);
13302
13303    // Add the offset to the reg_save_area to get the final address.
13304    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
13305      .addReg(OffsetReg64)
13306      .addReg(RegSaveReg);
13307
13308    // Compute the offset for the next argument
13309    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
13310    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
13311      .addReg(OffsetReg)
13312      .addImm(UseFPOffset ? 16 : 8);
13313
13314    // Store it back into the va_list.
13315    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
13316      .addOperand(Base)
13317      .addOperand(Scale)
13318      .addOperand(Index)
13319      .addDisp(Disp, UseFPOffset ? 4 : 0)
13320      .addOperand(Segment)
13321      .addReg(NextOffsetReg)
13322      .setMemRefs(MMOBegin, MMOEnd);
13323
13324    // Jump to endMBB
13325    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
13326      .addMBB(endMBB);
13327  }
13328
13329  //
13330  // Emit code to use overflow area
13331  //
13332
13333  // Load the overflow_area address into a register.
13334  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
13335  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
13336    .addOperand(Base)
13337    .addOperand(Scale)
13338    .addOperand(Index)
13339    .addDisp(Disp, 8)
13340    .addOperand(Segment)
13341    .setMemRefs(MMOBegin, MMOEnd);
13342
13343  // If we need to align it, do so. Otherwise, just copy the address
13344  // to OverflowDestReg.
13345  if (NeedsAlign) {
13346    // Align the overflow address
13347    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
13348    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
13349
13350    // aligned_addr = (addr + (align-1)) & ~(align-1)
13351    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
13352      .addReg(OverflowAddrReg)
13353      .addImm(Align-1);
13354
13355    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
13356      .addReg(TmpReg)
13357      .addImm(~(uint64_t)(Align-1));
13358  } else {
13359    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
13360      .addReg(OverflowAddrReg);
13361  }
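  // For example, with Align = 16 an overflow address of 0x1007 becomes
  // (0x1007 + 15) & ~15 = 0x1010, the next 16-byte boundary.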
13362
13363  // Compute the next overflow address after this argument.
13364  // (the overflow address should be kept 8-byte aligned)
13365  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
13366  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
13367    .addReg(OverflowDestReg)
13368    .addImm(ArgSizeA8);
13369
13370  // Store the new overflow address.
13371  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
13372    .addOperand(Base)
13373    .addOperand(Scale)
13374    .addOperand(Index)
13375    .addDisp(Disp, 8)
13376    .addOperand(Segment)
13377    .addReg(NextAddrReg)
13378    .setMemRefs(MMOBegin, MMOEnd);
13379
13380  // If we branched, emit the PHI to the front of endMBB.
13381  if (offsetMBB) {
13382    BuildMI(*endMBB, endMBB->begin(), DL,
13383            TII->get(X86::PHI), DestReg)
13384      .addReg(OffsetDestReg).addMBB(offsetMBB)
13385      .addReg(OverflowDestReg).addMBB(overflowMBB);
13386  }
13387
13388  // Erase the pseudo instruction
13389  MI->eraseFromParent();
13390
13391  return endMBB;
13392}
13393
13394MachineBasicBlock *
13395X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
13396                                                 MachineInstr *MI,
13397                                                 MachineBasicBlock *MBB) const {
13398  // Emit code to save XMM registers to the stack. The ABI says that the
13399  // number of registers to save is given in %al, so it's theoretically
13400  // possible to do an indirect jump trick to avoid saving all of them;
13401  // however, this code takes a simpler approach and just executes all
13402  // of the stores if %al is non-zero. It's less code, and it's probably
13403  // easier on the hardware branch predictor, and stores aren't all that
13404  // expensive anyway.
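  // For illustration (SysV x86-64 varargs convention): for a call such as
  // printf("%f", x) the caller sets %al to an upper bound on the number of
  // vector registers used (here 1), so a zero %al means no XMM argument
  // registers need to be saved at all.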
13405
13406  // Create the new basic blocks. One block contains all the XMM stores,
13407  // and one block is the final destination regardless of whether any
13408  // stores were performed.
13409  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
13410  MachineFunction *F = MBB->getParent();
13411  MachineFunction::iterator MBBIter = MBB;
13412  ++MBBIter;
13413  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
13414  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
13415  F->insert(MBBIter, XMMSaveMBB);
13416  F->insert(MBBIter, EndMBB);
13417
13418  // Transfer the remainder of MBB and its successor edges to EndMBB.
13419  EndMBB->splice(EndMBB->begin(), MBB,
13420                 llvm::next(MachineBasicBlock::iterator(MI)),
13421                 MBB->end());
13422  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
13423
13424  // The original block will now fall through to the XMM save block.
13425  MBB->addSuccessor(XMMSaveMBB);
13426  // The XMMSaveMBB will fall through to the end block.
13427  XMMSaveMBB->addSuccessor(EndMBB);
13428
13429  // Now add the instructions.
13430  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13431  DebugLoc DL = MI->getDebugLoc();
13432
13433  unsigned CountReg = MI->getOperand(0).getReg();
13434  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
13435  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
13436
13437  if (!Subtarget->isTargetWin64()) {
13438    // If %al is 0, branch around the XMM save block.
13439    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
13440    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
13441    MBB->addSuccessor(EndMBB);
13442  }
13443
13444  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
13445  // In the XMM save block, save all the XMM argument registers.
13446  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
13447    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
13448    MachineMemOperand *MMO =
13449      F->getMachineMemOperand(
13450        MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
13451        MachineMemOperand::MOStore,
13452        /*Size=*/16, /*Align=*/16);
13453    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
13454      .addFrameIndex(RegSaveFrameIndex)
13455      .addImm(/*Scale=*/1)
13456      .addReg(/*IndexReg=*/0)
13457      .addImm(/*Disp=*/Offset)
13458      .addReg(/*Segment=*/0)
13459      .addReg(MI->getOperand(i).getReg())
13460      .addMemOperand(MMO);
13461  }
13462
13463  MI->eraseFromParent();   // The pseudo instruction is gone now.
13464
13465  return EndMBB;
13466}
13467
13468// The EFLAGS operand of SelectItr might be missing a kill marker
13469// because there were multiple uses of EFLAGS, and ISel didn't know
13470// which to mark. Figure out whether SelectItr should have had a
13471// kill marker, and set it if it should. Returns the correct kill
13472// marker value.
13473static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
13474                                     MachineBasicBlock* BB,
13475                                     const TargetRegisterInfo* TRI) {
13476  // Scan forward through BB for a use/def of EFLAGS.
13477  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
13478  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
13479    const MachineInstr& mi = *miI;
13480    if (mi.readsRegister(X86::EFLAGS))
13481      return false;
13482    if (mi.definesRegister(X86::EFLAGS))
13483      break; // Should have kill-flag - update below.
13484  }
13485
13486  // If we hit the end of the block, check whether EFLAGS is live into a
13487  // successor.
13488  if (miI == BB->end()) {
13489    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
13490                                          sEnd = BB->succ_end();
13491         sItr != sEnd; ++sItr) {
13492      MachineBasicBlock* succ = *sItr;
13493      if (succ->isLiveIn(X86::EFLAGS))
13494        return false;
13495    }
13496  }
13497
13498  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
13499  // out. SelectMI should have a kill flag on EFLAGS.
13500  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
13501  return true;
13502}
13503
13504MachineBasicBlock *
13505X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
13506                                     MachineBasicBlock *BB) const {
13507  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13508  DebugLoc DL = MI->getDebugLoc();
13509
13510  // To "insert" a SELECT_CC instruction, we actually have to insert the
13511  // diamond control-flow pattern.  The incoming instruction knows the
13512  // destination vreg to set, the condition code register to branch on, the
13513  // true/false values to select between, and a branch opcode to use.
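  // Conceptually (illustrative only), the pseudo
  //   DstReg = cond ? TrueVal : FalseVal
  // becomes a conditional branch that skips copy0MBB when the condition
  // holds, with a PHI in sinkMBB merging the two incoming values.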
13514  const BasicBlock *LLVM_BB = BB->getBasicBlock();
13515  MachineFunction::iterator It = BB;
13516  ++It;
13517
13518  //  thisMBB:
13519  //  ...
13520  //   TrueVal = ...
13521  //   cmpTY ccX, r1, r2
13522  //   bCC copy1MBB
13523  //   fallthrough --> copy0MBB
13524  MachineBasicBlock *thisMBB = BB;
13525  MachineFunction *F = BB->getParent();
13526  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
13527  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
13528  F->insert(It, copy0MBB);
13529  F->insert(It, sinkMBB);
13530
13531  // If the EFLAGS register isn't dead in the terminator, then claim that it's
13532  // live into the sink and copy blocks.
13533  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
13534  if (!MI->killsRegister(X86::EFLAGS) &&
13535      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
13536    copy0MBB->addLiveIn(X86::EFLAGS);
13537    sinkMBB->addLiveIn(X86::EFLAGS);
13538  }
13539
13540  // Transfer the remainder of BB and its successor edges to sinkMBB.
13541  sinkMBB->splice(sinkMBB->begin(), BB,
13542                  llvm::next(MachineBasicBlock::iterator(MI)),
13543                  BB->end());
13544  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
13545
13546  // Add the true and fallthrough blocks as its successors.
13547  BB->addSuccessor(copy0MBB);
13548  BB->addSuccessor(sinkMBB);
13549
13550  // Create the conditional branch instruction.
13551  unsigned Opc =
13552    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
13553  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
13554
13555  //  copy0MBB:
13556  //   %FalseValue = ...
13557  //   # fallthrough to sinkMBB
13558  copy0MBB->addSuccessor(sinkMBB);
13559
13560  //  sinkMBB:
13561  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
13562  //  ...
13563  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
13564          TII->get(X86::PHI), MI->getOperand(0).getReg())
13565    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
13566    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
13567
13568  MI->eraseFromParent();   // The pseudo instruction is gone now.
13569  return sinkMBB;
13570}
13571
13572MachineBasicBlock *
13573X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
13574                                        bool Is64Bit) const {
13575  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13576  DebugLoc DL = MI->getDebugLoc();
13577  MachineFunction *MF = BB->getParent();
13578  const BasicBlock *LLVM_BB = BB->getBasicBlock();
13579
13580  assert(getTargetMachine().Options.EnableSegmentedStacks);
13581
13582  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
13583  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
13584
13585  // BB:
13586  //  ... [Till the alloca]
13587  // If stacklet is not large enough, jump to mallocMBB
13588  //
13589  // bumpMBB:
13590  //  Allocate by subtracting from RSP
13591  //  Jump to continueMBB
13592  //
13593  // mallocMBB:
13594  //  Allocate by call to runtime
13595  //
13596  // continueMBB:
13597  //  ...
13598  //  [rest of original BB]
13599  //
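  // Roughly, in pseudo-C (illustrative sketch of the blocks below):
  //   newSP = SP - size;
  //   if (stack_limit > newSP)      // limit read from %fs:0x70 / %gs:0x30
  //     ptr = __morestack_allocate_stack_space(size);
  //   else
  //     SP = ptr = newSP;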
13600
13601  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13602  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13603  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
13604
13605  MachineRegisterInfo &MRI = MF->getRegInfo();
13606  const TargetRegisterClass *AddrRegClass =
13607    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
13608
13609  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
13610    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
13611    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
13612    SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
13613    sizeVReg = MI->getOperand(1).getReg(),
13614    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
13615
13616  MachineFunction::iterator MBBIter = BB;
13617  ++MBBIter;
13618
13619  MF->insert(MBBIter, bumpMBB);
13620  MF->insert(MBBIter, mallocMBB);
13621  MF->insert(MBBIter, continueMBB);
13622
13623  continueMBB->splice(continueMBB->begin(), BB, llvm::next
13624                      (MachineBasicBlock::iterator(MI)), BB->end());
13625  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
13626
13627  // Add code to the main basic block to check if the stack limit has been hit,
13628  // and if so, jump to mallocMBB, otherwise to bumpMBB.
13629  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
13630  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
13631    .addReg(tmpSPVReg).addReg(sizeVReg);
13632  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
13633    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
13634    .addReg(SPLimitVReg);
13635  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
13636
13637  // bumpMBB simply decreases the stack pointer, since we know the current
13638  // stacklet has enough space.
13639  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
13640    .addReg(SPLimitVReg);
13641  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
13642    .addReg(SPLimitVReg);
13643  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
13644
13645  // Calls into a routine in libgcc to allocate more space from the heap.
13646  const uint32_t *RegMask =
13647    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
13648  if (Is64Bit) {
13649    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
13650      .addReg(sizeVReg);
13651    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
13652      .addExternalSymbol("__morestack_allocate_stack_space")
13653      .addRegMask(RegMask)
13654      .addReg(X86::RDI, RegState::Implicit)
13655      .addReg(X86::RAX, RegState::ImplicitDefine);
13656  } else {
13657    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
13658      .addImm(12);
13659    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
13660    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
13661      .addExternalSymbol("__morestack_allocate_stack_space")
13662      .addRegMask(RegMask)
13663      .addReg(X86::EAX, RegState::ImplicitDefine);
13664  }
13665
13666  if (!Is64Bit)
13667    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
13668      .addImm(16);
13669
13670  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
13671    .addReg(Is64Bit ? X86::RAX : X86::EAX);
13672  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
13673
13674  // Set up the CFG correctly.
13675  BB->addSuccessor(bumpMBB);
13676  BB->addSuccessor(mallocMBB);
13677  mallocMBB->addSuccessor(continueMBB);
13678  bumpMBB->addSuccessor(continueMBB);
13679
13680  // Take care of the PHI nodes.
13681  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
13682          MI->getOperand(0).getReg())
13683    .addReg(mallocPtrVReg).addMBB(mallocMBB)
13684    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
13685
13686  // Delete the original pseudo instruction.
13687  MI->eraseFromParent();
13688
13689  // And we're done.
13690  return continueMBB;
13691}
13692
13693MachineBasicBlock *
13694X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
13695                                          MachineBasicBlock *BB) const {
13696  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13697  DebugLoc DL = MI->getDebugLoc();
13698
13699  assert(!Subtarget->isTargetEnvMacho());
13700
13701  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
13702  // non-trivial part is impdef of ESP.
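  // The allocation size is passed in EAX (RAX for the Win64 variants); the
  // probe routines conventionally touch the stack one page at a time so
  // that each guard page is hit in order.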
13703
13704  if (Subtarget->isTargetWin64()) {
13705    if (Subtarget->isTargetCygMing()) {
13706      // ___chkstk(Mingw64):
13707      // Clobbers R10, R11, RAX and EFLAGS.
13708      // Updates RSP.
13709      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
13710        .addExternalSymbol("___chkstk")
13711        .addReg(X86::RAX, RegState::Implicit)
13712        .addReg(X86::RSP, RegState::Implicit)
13713        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
13714        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
13715        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
13716    } else {
13717      // __chkstk(MSVCRT): does not update stack pointer.
13718      // Clobbers R10, R11 and EFLAGS.
13719      // FIXME: RAX(allocated size) might be reused and not killed.
13720      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
13721        .addExternalSymbol("__chkstk")
13722        .addReg(X86::RAX, RegState::Implicit)
13723        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
13724      // RAX has the offset to be subtracted from RSP.
13725      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
13726        .addReg(X86::RSP)
13727        .addReg(X86::RAX);
13728    }
13729  } else {
13730    const char *StackProbeSymbol =
13731      Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
13732
13733    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
13734      .addExternalSymbol(StackProbeSymbol)
13735      .addReg(X86::EAX, RegState::Implicit)
13736      .addReg(X86::ESP, RegState::Implicit)
13737      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
13738      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
13739      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
13740  }
13741
13742  MI->eraseFromParent();   // The pseudo instruction is gone now.
13743  return BB;
13744}
13745
13746MachineBasicBlock *
13747X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
13748                                      MachineBasicBlock *BB) const {
13749  // This is pretty easy.  We're taking the value that we received from
13750  // our load from the relocation, sticking it in either RDI (x86-64)
13751  // or EAX and doing an indirect call.  The return value will then
13752  // be in the normal return register.
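  // On x86-64 Darwin this produces a sequence along the lines of
  //   movq  _var@TLVP(%rip), %rdi
  //   callq *(%rdi)
  // (illustrative), leaving the variable's address in RAX.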
13753  const X86InstrInfo *TII
13754    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
13755  DebugLoc DL = MI->getDebugLoc();
13756  MachineFunction *F = BB->getParent();
13757
13758  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
13759  assert(MI->getOperand(3).isGlobal() && "This should be a global");
13760
13761  // Get a register mask for the lowered call.
13762  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
13763  // proper register mask.
13764  const uint32_t *RegMask =
13765    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
13766  if (Subtarget->is64Bit()) {
13767    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
13768                                      TII->get(X86::MOV64rm), X86::RDI)
13769    .addReg(X86::RIP)
13770    .addImm(0).addReg(0)
13771    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
13772                      MI->getOperand(3).getTargetFlags())
13773    .addReg(0);
13774    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
13775    addDirectMem(MIB, X86::RDI);
13776    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
13777  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
13778    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
13779                                      TII->get(X86::MOV32rm), X86::EAX)
13780    .addReg(0)
13781    .addImm(0).addReg(0)
13782    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
13783                      MI->getOperand(3).getTargetFlags())
13784    .addReg(0);
13785    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
13786    addDirectMem(MIB, X86::EAX);
13787    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
13788  } else {
13789    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
13790                                      TII->get(X86::MOV32rm), X86::EAX)
13791    .addReg(TII->getGlobalBaseReg(F))
13792    .addImm(0).addReg(0)
13793    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
13794                      MI->getOperand(3).getTargetFlags())
13795    .addReg(0);
13796    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
13797    addDirectMem(MIB, X86::EAX);
13798    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
13799  }
13800
13801  MI->eraseFromParent(); // The pseudo instruction is gone now.
13802  return BB;
13803}
13804
13805MachineBasicBlock *
13806X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
13807                                    MachineBasicBlock *MBB) const {
13808  DebugLoc DL = MI->getDebugLoc();
13809  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13810
13811  MachineFunction *MF = MBB->getParent();
13812  MachineRegisterInfo &MRI = MF->getRegInfo();
13813
13814  const BasicBlock *BB = MBB->getBasicBlock();
13815  MachineFunction::iterator I = MBB;
13816  ++I;
13817
13818  // Memory Reference
13819  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
13820  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
13821
13822  unsigned DstReg;
13823  unsigned MemOpndSlot = 0;
13824
13825  unsigned CurOp = 0;
13826
13827  DstReg = MI->getOperand(CurOp++).getReg();
13828  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
13829  assert(RC->hasType(MVT::i32) && "Invalid destination!");
13830  unsigned mainDstReg = MRI.createVirtualRegister(RC);
13831  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
13832
13833  MemOpndSlot = CurOp;
13834
13835  MVT PVT = getPointerTy();
13836  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
13837         "Invalid Pointer Size!");
13838
13839  // For v = setjmp(buf), we generate
13840  //
13841  // thisMBB:
13842  //  buf[LabelOffset] = restoreMBB
13843  //  SjLjSetup restoreMBB
13844  //
13845  // mainMBB:
13846  //  v_main = 0
13847  //
13848  // sinkMBB:
13849  //  v = phi(main, restore)
13850  //
13851  // restoreMBB:
13852  //  v_restore = 1
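  // In other words (illustrative only): the direct path through mainMBB
  // yields v = 0, while a longjmp that resumes at restoreMBB yields v = 1,
  // matching the usual setjmp return-value convention. buf[LabelOffset]
  // holds the address of restoreMBB so the longjmp side knows where to
  // resume.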
13853
13854  MachineBasicBlock *thisMBB = MBB;
13855  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
13856  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
13857  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
13858  MF->insert(I, mainMBB);
13859  MF->insert(I, sinkMBB);
13860  MF->push_back(restoreMBB);
13861
13862  MachineInstrBuilder MIB;
13863
13864  // Transfer the remainder of BB and its successor edges to sinkMBB.
13865  sinkMBB->splice(sinkMBB->begin(), MBB,
13866                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
13867  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
13868
13869  // thisMBB:
13870  unsigned PtrStoreOpc = 0;
13871  unsigned LabelReg = 0;
13872  const int64_t LabelOffset = 1 * PVT.getStoreSize();
13873  Reloc::Model RM = getTargetMachine().getRelocationModel();
13874  bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
13875                     (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
13876
13877  // Prepare IP either in reg or imm.
13878  if (!UseImmLabel) {
13879    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
13880    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
13881    LabelReg = MRI.createVirtualRegister(PtrRC);
13882    if (Subtarget->is64Bit()) {
13883      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
13884              .addReg(X86::RIP)
13885              .addImm(0)
13886              .addReg(0)
13887              .addMBB(restoreMBB)
13888              .addReg(0);
13889    } else {
13890      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
13891      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
13892              .addReg(XII->getGlobalBaseReg(MF))
13893              .addImm(0)
13894              .addReg(0)
13895              .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
13896              .addReg(0);
13897    }
13898  } else
13899    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
13900  // Store IP
13901  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
13902  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
13903    if (i == X86::AddrDisp)
13904      MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
13905    else
13906      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
13907  }
13908  if (!UseImmLabel)
13909    MIB.addReg(LabelReg);
13910  else
13911    MIB.addMBB(restoreMBB);
13912  MIB.setMemRefs(MMOBegin, MMOEnd);
13913  // Setup
13914  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
13915          .addMBB(restoreMBB);
13916  MIB.addRegMask(RegInfo->getNoPreservedMask());
13917  thisMBB->addSuccessor(mainMBB);
13918  thisMBB->addSuccessor(restoreMBB);
13919
13920  // mainMBB:
13921  //  EAX = 0
13922  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
13923  mainMBB->addSuccessor(sinkMBB);
13924
13925  // sinkMBB:
13926  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
13927          TII->get(X86::PHI), DstReg)
13928    .addReg(mainDstReg).addMBB(mainMBB)
13929    .addReg(restoreDstReg).addMBB(restoreMBB);
13930
13931  // restoreMBB:
13932  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
13933  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
13934  restoreMBB->addSuccessor(sinkMBB);
13935
13936  MI->eraseFromParent();
13937  return sinkMBB;
13938}
13939
13940MachineBasicBlock *
13941X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
13942                                     MachineBasicBlock *MBB) const {
13943  DebugLoc DL = MI->getDebugLoc();
13944  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
13945
13946  MachineFunction *MF = MBB->getParent();
13947  MachineRegisterInfo &MRI = MF->getRegInfo();
13948
13949  // Memory Reference
13950  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
13951  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
13952
13953  MVT PVT = getPointerTy();
13954  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
13955         "Invalid Pointer Size!");
13956
13957  const TargetRegisterClass *RC =
13958    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
13959  unsigned Tmp = MRI.createVirtualRegister(RC);
13960  // Since FP is only updated here but NOT referenced, it's treated as GPR.
13961  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
13962  unsigned SP = RegInfo->getStackRegister();
13963
13964  MachineInstrBuilder MIB;
13965
13966  const int64_t LabelOffset = 1 * PVT.getStoreSize();
13967  const int64_t SPOffset = 2 * PVT.getStoreSize();
13968
13969  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
13970  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
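  // Buffer layout assumed by this lowering (pointer-sized slots): the code
  // below reloads the frame pointer from slot 0, the resume address stored
  // by the setjmp side from slot 1 (LabelOffset), and the stack pointer
  // from slot 2 (SPOffset), then performs an indirect jump to that address.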
13971
13972  // Reload FP
13973  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
13974  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
13975    MIB.addOperand(MI->getOperand(i));
13976  MIB.setMemRefs(MMOBegin, MMOEnd);
13977  // Reload IP
13978  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
13979  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
13980    if (i == X86::AddrDisp)
13981      MIB.addDisp(MI->getOperand(i), LabelOffset);
13982    else
13983      MIB.addOperand(MI->getOperand(i));
13984  }
13985  MIB.setMemRefs(MMOBegin, MMOEnd);
13986  // Reload SP
13987  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
13988  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
13989    if (i == X86::AddrDisp)
13990      MIB.addDisp(MI->getOperand(i), SPOffset);
13991    else
13992      MIB.addOperand(MI->getOperand(i));
13993  }
13994  MIB.setMemRefs(MMOBegin, MMOEnd);
13995  // Jump
13996  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
13997
13998  MI->eraseFromParent();
13999  return MBB;
14000}
14001
14002MachineBasicBlock *
14003X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
14004                                               MachineBasicBlock *BB) const {
14005  switch (MI->getOpcode()) {
14006  default: llvm_unreachable("Unexpected instr type to insert");
14007  case X86::TAILJMPd64:
14008  case X86::TAILJMPr64:
14009  case X86::TAILJMPm64:
14010    llvm_unreachable("TAILJMP64 would not be touched here.");
14011  case X86::TCRETURNdi64:
14012  case X86::TCRETURNri64:
14013  case X86::TCRETURNmi64:
14014    return BB;
14015  case X86::WIN_ALLOCA:
14016    return EmitLoweredWinAlloca(MI, BB);
14017  case X86::SEG_ALLOCA_32:
14018    return EmitLoweredSegAlloca(MI, BB, false);
14019  case X86::SEG_ALLOCA_64:
14020    return EmitLoweredSegAlloca(MI, BB, true);
14021  case X86::TLSCall_32:
14022  case X86::TLSCall_64:
14023    return EmitLoweredTLSCall(MI, BB);
14024  case X86::CMOV_GR8:
14025  case X86::CMOV_FR32:
14026  case X86::CMOV_FR64:
14027  case X86::CMOV_V4F32:
14028  case X86::CMOV_V2F64:
14029  case X86::CMOV_V2I64:
14030  case X86::CMOV_V8F32:
14031  case X86::CMOV_V4F64:
14032  case X86::CMOV_V4I64:
14033  case X86::CMOV_GR16:
14034  case X86::CMOV_GR32:
14035  case X86::CMOV_RFP32:
14036  case X86::CMOV_RFP64:
14037  case X86::CMOV_RFP80:
14038    return EmitLoweredSelect(MI, BB);
14039
14040  case X86::FP32_TO_INT16_IN_MEM:
14041  case X86::FP32_TO_INT32_IN_MEM:
14042  case X86::FP32_TO_INT64_IN_MEM:
14043  case X86::FP64_TO_INT16_IN_MEM:
14044  case X86::FP64_TO_INT32_IN_MEM:
14045  case X86::FP64_TO_INT64_IN_MEM:
14046  case X86::FP80_TO_INT16_IN_MEM:
14047  case X86::FP80_TO_INT32_IN_MEM:
14048  case X86::FP80_TO_INT64_IN_MEM: {
14049    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
14050    DebugLoc DL = MI->getDebugLoc();
14051
14052    // Change the floating point control register to use "round towards zero"
14053    // mode when truncating to an integer value.
14054    MachineFunction *F = BB->getParent();
14055    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
14056    addFrameReference(BuildMI(*BB, MI, DL,
14057                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
14058
14059    // Load the old value of the control word...
14060    unsigned OldCW =
14061      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
14062    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
14063                      CWFrameIdx);
14064
14065    // Store a control word that selects round-toward-zero...
14066    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
14067      .addImm(0xC7F);
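    // 0xC7F sets the rounding-control field (bits 11:10) to 11b, i.e. round
    // toward zero (truncate), while keeping the exception-mask bits in the
    // low byte set.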
14068
14069    // Reload the modified control word now...
14070    addFrameReference(BuildMI(*BB, MI, DL,
14071                              TII->get(X86::FLDCW16m)), CWFrameIdx);
14072
14073    // Restore the memory image of the control word to its original value.
14074    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
14075      .addReg(OldCW);
14076
14077    // Get the X86 opcode to use.
14078    unsigned Opc;
14079    switch (MI->getOpcode()) {
14080    default: llvm_unreachable("illegal opcode!");
14081    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
14082    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
14083    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
14084    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
14085    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
14086    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
14087    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
14088    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
14089    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
14090    }
14091
14092    X86AddressMode AM;
14093    MachineOperand &Op = MI->getOperand(0);
14094    if (Op.isReg()) {
14095      AM.BaseType = X86AddressMode::RegBase;
14096      AM.Base.Reg = Op.getReg();
14097    } else {
14098      AM.BaseType = X86AddressMode::FrameIndexBase;
14099      AM.Base.FrameIndex = Op.getIndex();
14100    }
14101    Op = MI->getOperand(1);
14102    if (Op.isImm())
14103      AM.Scale = Op.getImm();
14104    Op = MI->getOperand(2);
14105    if (Op.isImm())
14106      AM.IndexReg = Op.getImm();
14107    Op = MI->getOperand(3);
14108    if (Op.isGlobal()) {
14109      AM.GV = Op.getGlobal();
14110    } else {
14111      AM.Disp = Op.getImm();
14112    }
14113    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
14114                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
14115
14116    // Reload the original control word now.
14117    addFrameReference(BuildMI(*BB, MI, DL,
14118                              TII->get(X86::FLDCW16m)), CWFrameIdx);
14119
14120    MI->eraseFromParent();   // The pseudo instruction is gone now.
14121    return BB;
14122  }
14123  // String/text processing lowering.
14124  case X86::PCMPISTRM128REG:
14125  case X86::VPCMPISTRM128REG:
14126  case X86::PCMPISTRM128MEM:
14127  case X86::VPCMPISTRM128MEM:
14128  case X86::PCMPESTRM128REG:
14129  case X86::VPCMPESTRM128REG:
14130  case X86::PCMPESTRM128MEM:
14131  case X86::VPCMPESTRM128MEM:
14132    assert(Subtarget->hasSSE42() &&
14133           "Target must have SSE4.2 or AVX features enabled");
14134    return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
14135
14136  // String/text processing lowering.
14137  case X86::PCMPISTRIREG:
14138  case X86::VPCMPISTRIREG:
14139  case X86::PCMPISTRIMEM:
14140  case X86::VPCMPISTRIMEM:
14141  case X86::PCMPESTRIREG:
14142  case X86::VPCMPESTRIREG:
14143  case X86::PCMPESTRIMEM:
14144  case X86::VPCMPESTRIMEM:
14145    assert(Subtarget->hasSSE42() &&
14146           "Target must have SSE4.2 or AVX features enabled");
14147    return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
14148
14149  // Thread synchronization.
14150  case X86::MONITOR:
14151    return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
14152
14153  // xbegin
14154  case X86::XBEGIN:
14155    return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
14156
14157  // Atomic Lowering.
14158  case X86::ATOMAND8:
14159  case X86::ATOMAND16:
14160  case X86::ATOMAND32:
14161  case X86::ATOMAND64:
14162    // Fall through
14163  case X86::ATOMOR8:
14164  case X86::ATOMOR16:
14165  case X86::ATOMOR32:
14166  case X86::ATOMOR64:
14167    // Fall through
14168  case X86::ATOMXOR8:
14169  case X86::ATOMXOR16:
14170  case X86::ATOMXOR32:
14171  case X86::ATOMXOR64:
14172    // Fall through
14173  case X86::ATOMNAND8:
14174  case X86::ATOMNAND16:
14175  case X86::ATOMNAND32:
14176  case X86::ATOMNAND64:
14177    // Fall through
14178  case X86::ATOMMAX8:
14179  case X86::ATOMMAX16:
14180  case X86::ATOMMAX32:
14181  case X86::ATOMMAX64:
14182    // Fall through
14183  case X86::ATOMMIN8:
14184  case X86::ATOMMIN16:
14185  case X86::ATOMMIN32:
14186  case X86::ATOMMIN64:
14187    // Fall through
14188  case X86::ATOMUMAX8:
14189  case X86::ATOMUMAX16:
14190  case X86::ATOMUMAX32:
14191  case X86::ATOMUMAX64:
14192    // Fall through
14193  case X86::ATOMUMIN8:
14194  case X86::ATOMUMIN16:
14195  case X86::ATOMUMIN32:
14196  case X86::ATOMUMIN64:
14197    return EmitAtomicLoadArith(MI, BB);
14198
14199  // This group does 64-bit operations on a 32-bit target.
14200  case X86::ATOMAND6432:
14201  case X86::ATOMOR6432:
14202  case X86::ATOMXOR6432:
14203  case X86::ATOMNAND6432:
14204  case X86::ATOMADD6432:
14205  case X86::ATOMSUB6432:
14206  case X86::ATOMMAX6432:
14207  case X86::ATOMMIN6432:
14208  case X86::ATOMUMAX6432:
14209  case X86::ATOMUMIN6432:
14210  case X86::ATOMSWAP6432:
14211    return EmitAtomicLoadArith6432(MI, BB);
14212
14213  case X86::VASTART_SAVE_XMM_REGS:
14214    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
14215
14216  case X86::VAARG_64:
14217    return EmitVAARG64WithCustomInserter(MI, BB);
14218
14219  case X86::EH_SjLj_SetJmp32:
14220  case X86::EH_SjLj_SetJmp64:
14221    return emitEHSjLjSetJmp(MI, BB);
14222
14223  case X86::EH_SjLj_LongJmp32:
14224  case X86::EH_SjLj_LongJmp64:
14225    return emitEHSjLjLongJmp(MI, BB);
14226  }
14227}
14228
14229//===----------------------------------------------------------------------===//
14230//                           X86 Optimization Hooks
14231//===----------------------------------------------------------------------===//
14232
14233void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
14234                                                       APInt &KnownZero,
14235                                                       APInt &KnownOne,
14236                                                       const SelectionDAG &DAG,
14237                                                       unsigned Depth) const {
14238  unsigned BitWidth = KnownZero.getBitWidth();
14239  unsigned Opc = Op.getOpcode();
14240  assert((Opc >= ISD::BUILTIN_OP_END ||
14241          Opc == ISD::INTRINSIC_WO_CHAIN ||
14242          Opc == ISD::INTRINSIC_W_CHAIN ||
14243          Opc == ISD::INTRINSIC_VOID) &&
14244         "Should use MaskedValueIsZero if you don't know whether Op"
14245         " is a target node!");
14246
14247  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
14248  switch (Opc) {
14249  default: break;
14250  case X86ISD::ADD:
14251  case X86ISD::SUB:
14252  case X86ISD::ADC:
14253  case X86ISD::SBB:
14254  case X86ISD::SMUL:
14255  case X86ISD::UMUL:
14256  case X86ISD::INC:
14257  case X86ISD::DEC:
14258  case X86ISD::OR:
14259  case X86ISD::XOR:
14260  case X86ISD::AND:
14261    // These nodes' second result is a boolean.
14262    if (Op.getResNo() == 0)
14263      break;
14264    // Fallthrough
14265  case X86ISD::SETCC:
14266    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
14267    break;
14268  case ISD::INTRINSIC_WO_CHAIN: {
14269    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
14270    unsigned NumLoBits = 0;
14271    switch (IntId) {
14272    default: break;
14273    case Intrinsic::x86_sse_movmsk_ps:
14274    case Intrinsic::x86_avx_movmsk_ps_256:
14275    case Intrinsic::x86_sse2_movmsk_pd:
14276    case Intrinsic::x86_avx_movmsk_pd_256:
14277    case Intrinsic::x86_mmx_pmovmskb:
14278    case Intrinsic::x86_sse2_pmovmskb_128:
14279    case Intrinsic::x86_avx2_pmovmskb: {
14280      // High bits of movmskp{s|d}, pmovmskb are known zero.
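      // For example, movmskps produces only a 4-bit mask, so bits 31..4 of
      // the i32 result are known to be zero.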
14281      switch (IntId) {
14282        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
14283        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
14284        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
14285        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
14286        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
14287        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
14288        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
14289        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
14290      }
14291      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
14292      break;
14293    }
14294    }
14295    break;
14296  }
14297  }
14298}
14299
14300unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
14301                                                         unsigned Depth) const {
14302  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
14303  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
14304    return Op.getValueType().getScalarType().getSizeInBits();
14305
14306  // Fallback case.
14307  return 1;
14308}
14309
14310/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
14311/// node is a GlobalAddress + offset.
14312bool X86TargetLowering::isGAPlusOffset(SDNode *N,
14313                                       const GlobalValue* &GA,
14314                                       int64_t &Offset) const {
14315  if (N->getOpcode() == X86ISD::Wrapper) {
14316    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
14317      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
14318      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
14319      return true;
14320    }
14321  }
14322  return TargetLowering::isGAPlusOffset(N, GA, Offset);
14323}
14324
14325/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
14326/// same as extracting the high 128-bit part of a 256-bit vector and then
14327/// inserting the result into the low part of a new 256-bit vector
14328static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
14329  EVT VT = SVOp->getValueType(0);
14330  unsigned NumElems = VT.getVectorNumElements();
14331
14332  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
14333  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
14334    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
14335        SVOp->getMaskElt(j) >= 0)
14336      return false;
14337
14338  return true;
14339}
14340
14341/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
14342/// same as extracting the low 128-bit part of a 256-bit vector and then
14343/// inserting the result into the high part of a new 256-bit vector
14344static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
14345  EVT VT = SVOp->getValueType(0);
14346  unsigned NumElems = VT.getVectorNumElements();
14347
14348  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
14349  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
14350    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
14351        SVOp->getMaskElt(j) >= 0)
14352      return false;
14353
14354  return true;
14355}
14356
14357/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
14358static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
14359                                        TargetLowering::DAGCombinerInfo &DCI,
14360                                        const X86Subtarget* Subtarget) {
14361  DebugLoc dl = N->getDebugLoc();
14362  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
14363  SDValue V1 = SVOp->getOperand(0);
14364  SDValue V2 = SVOp->getOperand(1);
14365  EVT VT = SVOp->getValueType(0);
14366  unsigned NumElems = VT.getVectorNumElements();
14367
14368  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
14369      V2.getOpcode() == ISD::CONCAT_VECTORS) {
14370    //
14371    //                   0,0,0,...
14372    //                      |
14373    //    V      UNDEF    BUILD_VECTOR    UNDEF
14374    //     \      /           \           /
14375    //  CONCAT_VECTOR         CONCAT_VECTOR
14376    //         \                  /
14377    //          \                /
14378    //          RESULT: V + zero extended
14379    //
14380    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
14381        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
14382        V1.getOperand(1).getOpcode() != ISD::UNDEF)
14383      return SDValue();
14384
14385    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
14386      return SDValue();
14387
14388    // To match the shuffle mask, the first half of the mask should
14389    // be exactly the first vector, and all the rest a splat with the
14390    // first element of the second one.
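    // For example, with v8i32 this matches a mask such as
    // <0, 1, 2, 3, 8, 8, 8, 8> (undef entries are also accepted).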
14391    for (unsigned i = 0; i != NumElems/2; ++i)
14392      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
14393          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
14394        return SDValue();
14395
14396    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
14397    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
14398      if (Ld->hasNUsesOfValue(1, 0)) {
14399        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
14400        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
14401        SDValue ResNode =
14402          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
14403                                  Ld->getMemoryVT(),
14404                                  Ld->getPointerInfo(),
14405                                  Ld->getAlignment(),
14406                                  false/*isVolatile*/, true/*ReadMem*/,
14407                                  false/*WriteMem*/);
14408
14409        // Make sure the newly-created LOAD is in the same position as Ld in
14410        // terms of dependency. We create a TokenFactor for Ld and ResNode,
14411        // and update uses of Ld's output chain to use the TokenFactor.
14412        if (Ld->hasAnyUseOfValue(1)) {
14413          SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
14414                             SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
14415          DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
14416          DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
14417                                 SDValue(ResNode.getNode(), 1));
14418        }
14419
14420        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
14421      }
14422    }
14423
14424    // Emit a zeroed vector and insert the desired subvector into its
14425    // first half.
14426    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
14427    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
14428    return DCI.CombineTo(N, InsV);
14429  }
14430
14431  //===--------------------------------------------------------------------===//
14432  // Combine some shuffles into subvector extracts and inserts:
14433  //
14434
14435  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
14436  if (isShuffleHigh128VectorInsertLow(SVOp)) {
14437    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
14438    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
14439    return DCI.CombineTo(N, InsV);
14440  }
14441
14442  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
14443  if (isShuffleLow128VectorInsertHigh(SVOp)) {
14444    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
14445    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
14446    return DCI.CombineTo(N, InsV);
14447  }
14448
14449  return SDValue();
14450}
14451
14452/// PerformShuffleCombine - Performs several different shuffle combines.
14453static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
14454                                     TargetLowering::DAGCombinerInfo &DCI,
14455                                     const X86Subtarget *Subtarget) {
14456  DebugLoc dl = N->getDebugLoc();
14457  EVT VT = N->getValueType(0);
14458
14459  // Don't create instructions with illegal types after legalize types has run.
14460  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14461  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
14462    return SDValue();
14463
14464  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
14465  if (Subtarget->hasFp256() && VT.is256BitVector() &&
14466      N->getOpcode() == ISD::VECTOR_SHUFFLE)
14467    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
14468
14469  // Only handle 128-bit wide vectors from here on.
14470  if (!VT.is128BitVector())
14471    return SDValue();
14472
14473  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
14474  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
14475  // consecutive, non-overlapping, and in the right order.
14476  SmallVector<SDValue, 16> Elts;
14477  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
14478    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
14479
14480  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
14481}
14482
14483/// PerformTruncateCombine - Converts a truncate operation into
14484/// a sequence of vector shuffle operations.
14485/// This is possible when truncating a 256-bit vector to a 128-bit vector.
14486static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
14487                                      TargetLowering::DAGCombinerInfo &DCI,
14488                                      const X86Subtarget *Subtarget)  {
14489  return SDValue();
14490}
14491
14492/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
14493/// specific shuffle of a load can be folded into a single element load.
14494/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
14495/// shuffles have been custom lowered, so we need to handle those here.
14496static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
14497                                         TargetLowering::DAGCombinerInfo &DCI) {
14498  if (DCI.isBeforeLegalizeOps())
14499    return SDValue();
14500
14501  SDValue InVec = N->getOperand(0);
14502  SDValue EltNo = N->getOperand(1);
14503
14504  if (!isa<ConstantSDNode>(EltNo))
14505    return SDValue();
14506
14507  EVT VT = InVec.getValueType();
14508
14509  bool HasShuffleIntoBitcast = false;
14510  if (InVec.getOpcode() == ISD::BITCAST) {
14511    // Don't duplicate a load with other uses.
14512    if (!InVec.hasOneUse())
14513      return SDValue();
14514    EVT BCVT = InVec.getOperand(0).getValueType();
14515    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
14516      return SDValue();
14517    InVec = InVec.getOperand(0);
14518    HasShuffleIntoBitcast = true;
14519  }
14520
14521  if (!isTargetShuffle(InVec.getOpcode()))
14522    return SDValue();
14523
14524  // Don't duplicate a load with other uses.
14525  if (!InVec.hasOneUse())
14526    return SDValue();
14527
14528  SmallVector<int, 16> ShuffleMask;
14529  bool UnaryShuffle;
14530  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
14531                            UnaryShuffle))
14532    return SDValue();
14533
14534  // Select the input vector, guarding against an out-of-range extract index.
14535  unsigned NumElems = VT.getVectorNumElements();
14536  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
14537  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
14538  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
14539                                         : InVec.getOperand(1);
14540
14541  // If both inputs to the shuffle are the same node, then allow 2 uses.
14542  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
14543
14544  if (LdNode.getOpcode() == ISD::BITCAST) {
14545    // Don't duplicate a load with other uses.
14546    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
14547      return SDValue();
14548
14549    AllowedUses = 1; // only allow 1 load use if we have a bitcast
14550    LdNode = LdNode.getOperand(0);
14551  }
14552
14553  if (!ISD::isNormalLoad(LdNode.getNode()))
14554    return SDValue();
14555
14556  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
14557
14558  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
14559    return SDValue();
14560
14561  if (HasShuffleIntoBitcast) {
14562    // If there's a bitcast before the shuffle, check if the load type and
14563    // alignment are valid.
14564    unsigned Align = LN0->getAlignment();
14565    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14566    unsigned NewAlign = TLI.getDataLayout()->
14567      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
14568
14569    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
14570      return SDValue();
14571  }
14572
14573  // All checks passed, so transform back to a vector_shuffle so that the DAG
14574  // combiner can finish the job.
14575  DebugLoc dl = N->getDebugLoc();
14576
14577  // Create a shuffle node, accounting for the case that it's a unary shuffle.
14578  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
14579  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
14580                                 InVec.getOperand(0), Shuffle,
14581                                 &ShuffleMask[0]);
14582  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
14583  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
14584                     EltNo);
14585}
14586
14587/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
14588/// generation and convert it from being a bunch of shuffles and extracts
14589/// to a simple store and scalar loads to extract the elements.
14590static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
14591                                         TargetLowering::DAGCombinerInfo &DCI) {
14592  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
14593  if (NewOp.getNode())
14594    return NewOp;
14595
14596  SDValue InputVector = N->getOperand(0);
14597  // Detect whether we are trying to convert from mmx to i32 and the bitcast
14598  // from mmx to v2i32 has a single use.
14599  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
14600      InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
14601      InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
14602    return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(),
14603                       N->getValueType(0),
14604                       InputVector.getNode()->getOperand(0));
14605
14606  // Only operate on vectors of 4 elements, where the alternative shuffling
14607  // gets to be more expensive.
14608  if (InputVector.getValueType() != MVT::v4i32)
14609    return SDValue();
14610
14611  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
14612  // single use which is a sign-extend or zero-extend, and all elements are
14613  // used.
14614  SmallVector<SDNode *, 4> Uses;
14615  unsigned ExtractedElements = 0;
14616  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
14617       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
14618    if (UI.getUse().getResNo() != InputVector.getResNo())
14619      return SDValue();
14620
14621    SDNode *Extract = *UI;
14622    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14623      return SDValue();
14624
14625    if (Extract->getValueType(0) != MVT::i32)
14626      return SDValue();
14627    if (!Extract->hasOneUse())
14628      return SDValue();
14629    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
14630        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
14631      return SDValue();
14632    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
14633      return SDValue();
14634
14635    // Record which element was extracted.
14636    ExtractedElements |=
14637      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
14638
14639    Uses.push_back(Extract);
14640  }
14641
14642  // If not all the elements were used, this may not be worthwhile.
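  // (For v4i32, a full extraction mask is 0b1111, i.e. 15.)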
14643  if (ExtractedElements != 15)
14644    return SDValue();
14645
14646  // Ok, we've now decided to do the transformation.
14647  DebugLoc dl = InputVector.getDebugLoc();
14648
14649  // Store the value to a temporary stack slot.
14650  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
14651  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
14652                            MachinePointerInfo(), false, false, 0);
14653
14654  // Replace each use (extract) with a load of the appropriate element.
14655  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
14656       UE = Uses.end(); UI != UE; ++UI) {
14657    SDNode *Extract = *UI;
14658
14659    // Compute the element's address.
14660    SDValue Idx = Extract->getOperand(1);
14661    unsigned EltSize =
14662        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
14663    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
14664    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14665    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
14666
14667    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
14668                                     StackPtr, OffsetVal);
14669
14670    // Load the scalar.
14671    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
14672                                     ScalarAddr, MachinePointerInfo(),
14673                                     false, false, false, 0);
14674
14675    // Replace the extract with the load.
14676    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
14677  }
14678
14679  // The replacement was made in place; don't return anything.
14680  return SDValue();
14681}
14682
14683/// \brief Matches a VSELECT onto min/max or returns 0 if the node doesn't match.
14684static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
14685                                   SDValue RHS, SelectionDAG &DAG,
14686                                   const X86Subtarget *Subtarget) {
14687  if (!VT.isVector())
14688    return 0;
14689
14690  switch (VT.getSimpleVT().SimpleTy) {
14691  default: return 0;
14692  case MVT::v32i8:
14693  case MVT::v16i16:
14694  case MVT::v8i32:
14695    if (!Subtarget->hasAVX2())
14696      return 0;
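    // FALL THROUGH when AVX2 is available; the cases below only require SSE2.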
14697  case MVT::v16i8:
14698  case MVT::v8i16:
14699  case MVT::v4i32:
14700    if (!Subtarget->hasSSE2())
14701      return 0;
14702  }
14703
14704  // SSE2 has only a small subset of the operations.
14705  bool hasUnsigned = Subtarget->hasSSE41() ||
14706                     (Subtarget->hasSSE2() && VT == MVT::v16i8);
14707  bool hasSigned = Subtarget->hasSSE41() ||
14708                   (Subtarget->hasSSE2() && VT == MVT::v8i16);
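  // (SSE2 only provides PMINUB/PMAXUB for unsigned v16i8 and PMINSW/PMAXSW
  //  for signed v8i16; SSE4.1 adds the remaining variants.)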
14709
14710  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
14711
14712  // Check for x CC y ? x : y.
14713  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
14714      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
14715    switch (CC) {
14716    default: break;
14717    case ISD::SETULT:
14718    case ISD::SETULE:
14719      return hasUnsigned ? X86ISD::UMIN : 0;
14720    case ISD::SETUGT:
14721    case ISD::SETUGE:
14722      return hasUnsigned ? X86ISD::UMAX : 0;
14723    case ISD::SETLT:
14724    case ISD::SETLE:
14725      return hasSigned ? X86ISD::SMIN : 0;
14726    case ISD::SETGT:
14727    case ISD::SETGE:
14728      return hasSigned ? X86ISD::SMAX : 0;
14729    }
14730  // Check for x CC y ? y : x -- a min/max with reversed arms.
14731  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
14732             DAG.isEqualTo(RHS, Cond.getOperand(0))) {
14733    switch (CC) {
14734    default: break;
14735    case ISD::SETULT:
14736    case ISD::SETULE:
14737      return hasUnsigned ? X86ISD::UMAX : 0;
14738    case ISD::SETUGT:
14739    case ISD::SETUGE:
14740      return hasUnsigned ? X86ISD::UMIN : 0;
14741    case ISD::SETLT:
14742    case ISD::SETLE:
14743      return hasSigned ? X86ISD::SMAX : 0;
14744    case ISD::SETGT:
14745    case ISD::SETGE:
14746      return hasSigned ? X86ISD::SMIN : 0;
14747    }
14748  }
14749
14750  return 0;
14751}
14752
14753/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
14754/// nodes.
14755static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
14756                                    TargetLowering::DAGCombinerInfo &DCI,
14757                                    const X86Subtarget *Subtarget) {
14758  DebugLoc DL = N->getDebugLoc();
14759  SDValue Cond = N->getOperand(0);
14760  // Get the LHS/RHS of the select.
14761  SDValue LHS = N->getOperand(1);
14762  SDValue RHS = N->getOperand(2);
14763  EVT VT = LHS.getValueType();
14764
14765  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
14766  // instructions match the semantics of the common C idiom x<y?x:y but not
14767  // x<=y?x:y, because of how they handle negative zero (which can be
14768  // ignored in unsafe-math mode).
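  // For example, when the checks below pass, (select (setcc x, y, setolt), x, y)
  // on f32 becomes (X86ISD::FMIN x, y).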
14769  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
14770      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
14771      (Subtarget->hasSSE2() ||
14772       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
14773    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
14774
14775    unsigned Opcode = 0;
14776    // Check for x CC y ? x : y.
14777    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
14778        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
14779      switch (CC) {
14780      default: break;
14781      case ISD::SETULT:
14782        // Converting this to a min would handle NaNs incorrectly, and swapping
14783        // the operands would cause it to handle comparisons between positive
14784        // and negative zero incorrectly.
14785        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
14786          if (!DAG.getTarget().Options.UnsafeFPMath &&
14787              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
14788            break;
14789          std::swap(LHS, RHS);
14790        }
14791        Opcode = X86ISD::FMIN;
14792        break;
14793      case ISD::SETOLE:
14794        // Converting this to a min would handle comparisons between positive
14795        // and negative zero incorrectly.
14796        if (!DAG.getTarget().Options.UnsafeFPMath &&
14797            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
14798          break;
14799        Opcode = X86ISD::FMIN;
14800        break;
14801      case ISD::SETULE:
14802        // Converting this to a min would handle both negative zeros and NaNs
14803        // incorrectly, but we can swap the operands to fix both.
14804        std::swap(LHS, RHS);
14805      case ISD::SETOLT:
14806      case ISD::SETLT:
14807      case ISD::SETLE:
14808        Opcode = X86ISD::FMIN;
14809        break;
14810
14811      case ISD::SETOGE:
14812        // Converting this to a max would handle comparisons between positive
14813        // and negative zero incorrectly.
14814        if (!DAG.getTarget().Options.UnsafeFPMath &&
14815            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
14816          break;
14817        Opcode = X86ISD::FMAX;
14818        break;
14819      case ISD::SETUGT:
14820        // Converting this to a max would handle NaNs incorrectly, and swapping
14821        // the operands would cause it to handle comparisons between positive
14822        // and negative zero incorrectly.
14823        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
14824          if (!DAG.getTarget().Options.UnsafeFPMath &&
14825              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
14826            break;
14827          std::swap(LHS, RHS);
14828        }
14829        Opcode = X86ISD::FMAX;
14830        break;
14831      case ISD::SETUGE:
14832        // Converting this to a max would handle both negative zeros and NaNs
14833        // incorrectly, but we can swap the operands to fix both.
14834        std::swap(LHS, RHS);
14835      case ISD::SETOGT:
14836      case ISD::SETGT:
14837      case ISD::SETGE:
14838        Opcode = X86ISD::FMAX;
14839        break;
14840      }
14841    // Check for x CC y ? y : x -- a min/max with reversed arms.
14842    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
14843               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
14844      switch (CC) {
14845      default: break;
14846      case ISD::SETOGE:
14847        // Converting this to a min would handle comparisons between positive
14848        // and negative zero incorrectly, and swapping the operands would
14849        // cause it to handle NaNs incorrectly.
14850        if (!DAG.getTarget().Options.UnsafeFPMath &&
14851            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
14852          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
14853            break;
14854          std::swap(LHS, RHS);
14855        }
14856        Opcode = X86ISD::FMIN;
14857        break;
14858      case ISD::SETUGT:
14859        // Converting this to a min would handle NaNs incorrectly.
14860        if (!DAG.getTarget().Options.UnsafeFPMath &&
14861            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
14862          break;
14863        Opcode = X86ISD::FMIN;
14864        break;
14865      case ISD::SETUGE:
14866        // Converting this to a min would handle both negative zeros and NaNs
14867        // incorrectly, but we can swap the operands to fix both.
14868        std::swap(LHS, RHS);
14869      case ISD::SETOGT:
14870      case ISD::SETGT:
14871      case ISD::SETGE:
14872        Opcode = X86ISD::FMIN;
14873        break;
14874
14875      case ISD::SETULT:
14876        // Converting this to a max would handle NaNs incorrectly.
14877        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
14878          break;
14879        Opcode = X86ISD::FMAX;
14880        break;
14881      case ISD::SETOLE:
14882        // Converting this to a max would handle comparisons between positive
14883        // and negative zero incorrectly, and swapping the operands would
14884        // cause it to handle NaNs incorrectly.
14885        if (!DAG.getTarget().Options.UnsafeFPMath &&
14886            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
14887          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
14888            break;
14889          std::swap(LHS, RHS);
14890        }
14891        Opcode = X86ISD::FMAX;
14892        break;
14893      case ISD::SETULE:
14894        // Converting this to a max would handle both negative zeros and NaNs
14895        // incorrectly, but we can swap the operands to fix both.
14896        std::swap(LHS, RHS);
14897      case ISD::SETOLT:
14898      case ISD::SETLT:
14899      case ISD::SETLE:
14900        Opcode = X86ISD::FMAX;
14901        break;
14902      }
14903    }
14904
14905    if (Opcode)
14906      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
14907  }
14908
14909  // If this is a select between two integer constants, try to do some
14910  // optimizations.
14911  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
14912    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
14913      // Don't do this for crazy integer types.
14914      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
14915        // If this is efficiently invertible, canonicalize the TrueC/FalseC values
14916        // so that TrueC (the true value) is larger than FalseC.
14917        bool NeedsCondInvert = false;
14918
14919        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
14920            // Efficiently invertible.
14921            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
14922             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
14923              isa<ConstantSDNode>(Cond.getOperand(1))))) {
14924          NeedsCondInvert = true;
14925          std::swap(TrueC, FalseC);
14926        }
14927
14928        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
14929        if (FalseC->getAPIntValue() == 0 &&
14930            TrueC->getAPIntValue().isPowerOf2()) {
14931          if (NeedsCondInvert) // Invert the condition if needed.
14932            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
14933                               DAG.getConstant(1, Cond.getValueType()));
14934
14935          // Zero extend the condition if needed.
14936          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
14937
14938          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
14939          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
14940                             DAG.getConstant(ShAmt, MVT::i8));
14941        }
14942
14943        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
14944        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
14945          if (NeedsCondInvert) // Invert the condition if needed.
14946            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
14947                               DAG.getConstant(1, Cond.getValueType()));
14948
14949          // Zero extend the condition if needed.
14950          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
14951                             FalseC->getValueType(0), Cond);
14952          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
14953                             SDValue(FalseC, 0));
14954        }
14955
14956        // Optimize cases that will turn into an LEA instruction.  This requires
14957        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
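        // For example, (select C, 5, 2) becomes 2 + 3*zext(C), i.e. a single LEA.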
14958        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
14959          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
14960          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
14961
14962          bool isFastMultiplier = false;
14963          if (Diff < 10) {
14964            switch ((unsigned char)Diff) {
14965              default: break;
14966              case 1:  // result = add base, cond
14967              case 2:  // result = lea base(    , cond*2)
14968              case 3:  // result = lea base(cond, cond*2)
14969              case 4:  // result = lea base(    , cond*4)
14970              case 5:  // result = lea base(cond, cond*4)
14971              case 8:  // result = lea base(    , cond*8)
14972              case 9:  // result = lea base(cond, cond*8)
14973                isFastMultiplier = true;
14974                break;
14975            }
14976          }
14977
14978          if (isFastMultiplier) {
14979            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
14980            if (NeedsCondInvert) // Invert the condition if needed.
14981              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
14982                                 DAG.getConstant(1, Cond.getValueType()));
14983
14984            // Zero extend the condition if needed.
14985            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
14986                               Cond);
14987            // Scale the condition by the difference.
14988            if (Diff != 1)
14989              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
14990                                 DAG.getConstant(Diff, Cond.getValueType()));
14991
14992            // Add the base if non-zero.
14993            if (FalseC->getAPIntValue() != 0)
14994              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
14995                                 SDValue(FalseC, 0));
14996            return Cond;
14997          }
14998        }
14999      }
15000  }
15001
15002  // Canonicalize max and min:
15003  // (x > y) ? x : y -> (x >= y) ? x : y
15004  // (x < y) ? x : y -> (x <= y) ? x : y
15005  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
15006  // the need for an extra compare
15007  // against zero. e.g.
15008  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
15009  // subl   %esi, %edi
15010  // testl  %edi, %edi
15011  // movl   $0, %eax
15012  // cmovgl %edi, %eax
15013  // =>
15014  // xorl   %eax, %eax
15015  // subl   %esi, %edi
15016  // cmovsl %eax, %edi
15017  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
15018      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
15019      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
15020    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
15021    switch (CC) {
15022    default: break;
15023    case ISD::SETLT:
15024    case ISD::SETGT: {
15025      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
15026      Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
15027                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
15028      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
15029    }
15030    }
15031  }
15032
15033  // Match VSELECTs into subs with unsigned saturation.
15034  if (!DCI.isBeforeLegalize() &&
15035      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
15036      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
15037      ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
15038       (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
15039    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
15040
15041    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
15042    // left side invert the predicate to simplify logic below.
15043    SDValue Other;
15044    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
15045      Other = RHS;
15046      CC = ISD::getSetCCInverse(CC, true);
15047    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
15048      Other = LHS;
15049    }
15050
15051    if (Other.getNode() && Other->getNumOperands() == 2 &&
15052        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
15053      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
15054      SDValue CondRHS = Cond->getOperand(1);
15055
15056      // Look for a general sub with unsigned saturation first.
15057      // x >= y ? x-y : 0 --> subus x, y
15058      // x >  y ? x-y : 0 --> subus x, y
15059      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
15060          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
15061        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
15062
15063      // If the RHS is a constant we have to reverse the const canonicalization.
15064      // x > C-1 ? x+(-C) : 0 --> subus x, C
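      // (The ADD carries -C because the DAG canonicalizes x-C into x+(-C).)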
15065      if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
15066          isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
15067        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
15068        if (CondRHS.getConstantOperandVal(0) == -A-1) {
15069          SmallVector<SDValue, 32> V(VT.getVectorNumElements(),
15070                                     DAG.getConstant(-A, VT.getScalarType()));
15071          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
15072                             DAG.getNode(ISD::BUILD_VECTOR, DL, VT,
15073                                         V.data(), V.size()));
15074        }
15075      }
15076
15077      // Another special case: If C was a sign bit, the sub has been
15078      // canonicalized into a xor.
15079      // FIXME: Would it be better to use ComputeMaskedBits to determine whether
15080      //        it's safe to decanonicalize the xor?
15081      // x s< 0 ? x^C : 0 --> subus x, C
15082      if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
15083          ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
15084          isSplatVector(OpRHS.getNode())) {
15085        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
15086        if (A.isSignBit())
15087          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
15088      }
15089    }
15090  }
15091
15092  // Try to match a min/max vector operation.
15093  if (!DCI.isBeforeLegalize() &&
15094      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
15095    if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
15096      return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
15097
15098  // If we know that this node is legal then we know that it is going to be
15099  // matched by one of the SSE/AVX BLEND instructions. These instructions only
15100  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
15101  // to simplify previous instructions.
15102  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15103  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
15104      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
15105    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
15106
15107    // Don't optimize vector selects that map to mask-registers.
15108    if (BitWidth == 1)
15109      return SDValue();
15110
15111    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
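    // Only the sign bit of each mask element is consumed by the blend, so that
    // is the only bit we mark as demanded.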
15112    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
15113
15114    APInt KnownZero, KnownOne;
15115    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
15116                                          DCI.isBeforeLegalizeOps());
15117    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
15118        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
15119      DCI.CommitTargetLoweringOpt(TLO);
15120  }
15121
15122  return SDValue();
15123}
15124
15125// Check whether a boolean test is testing a boolean value generated by
15126// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
15127// code.
15128//
15129// Simplify the following patterns:
15130// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
15131// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
15132// to (Op EFLAGS Cond)
15133//
15134// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
15135// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
15136// to (Op EFLAGS !Cond)
15137//
15138// where Op could be BRCOND or CMOV.
15139//
15140static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
15141  // Quit if it is neither a CMP nor a SUB whose value result is unused.
15142  if (Cmp.getOpcode() != X86ISD::CMP &&
15143      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
15144      return SDValue();
15145
15146  // Quit if not used as a boolean value.
15147  if (CC != X86::COND_E && CC != X86::COND_NE)
15148    return SDValue();
15149
15150  // Check CMP operands. One of them should be 0 or 1 and the other should be
15151  // a SetCC or extended from it.
15152  SDValue Op1 = Cmp.getOperand(0);
15153  SDValue Op2 = Cmp.getOperand(1);
15154
15155  SDValue SetCC;
15156  const ConstantSDNode* C = 0;
15157  bool needOppositeCond = (CC == X86::COND_E);
15158
15159  if ((C = dyn_cast<ConstantSDNode>(Op1)))
15160    SetCC = Op2;
15161  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
15162    SetCC = Op1;
15163  else // Quit if neither operand is a constant.
15164    return SDValue();
15165
15166  if (C->getZExtValue() == 1)
15167    needOppositeCond = !needOppositeCond;
15168  else if (C->getZExtValue() != 0)
15169    // Quit if the constant is neither 0 nor 1.
15170    return SDValue();
15171
15172  // Skip 'zext' node.
15173  if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
15174    SetCC = SetCC.getOperand(0);
15175
15176  switch (SetCC.getOpcode()) {
15177  case X86ISD::SETCC:
15178    // Set the condition code or opposite one if necessary.
15179    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
15180    if (needOppositeCond)
15181      CC = X86::GetOppositeBranchCondition(CC);
15182    return SetCC.getOperand(1);
15183  case X86ISD::CMOV: {
15184    // Check whether the false/true values are canonical, i.e. 0 or 1.
15185    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
15186    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
15187    // Quit if true value is not a constant.
15188    if (!TVal)
15189      return SDValue();
15190    // Quit if false value is not a constant.
15191    if (!FVal) {
15192      // A special case for rdrand, where 0 is used when the false cond is found.
15193      SDValue Op = SetCC.getOperand(0);
15194      if (Op.getOpcode() != X86ISD::RDRAND)
15195        return SDValue();
15196    }
15197    // Quit if false value is not the constant 0 or 1.
15198    bool FValIsFalse = true;
15199    if (FVal && FVal->getZExtValue() != 0) {
15200      if (FVal->getZExtValue() != 1)
15201        return SDValue();
15202      // If FVal is 1, opposite cond is needed.
15203      needOppositeCond = !needOppositeCond;
15204      FValIsFalse = false;
15205    }
15206    // Quit if TVal is not the constant opposite of FVal.
15207    if (FValIsFalse && TVal->getZExtValue() != 1)
15208      return SDValue();
15209    if (!FValIsFalse && TVal->getZExtValue() != 0)
15210      return SDValue();
15211    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
15212    if (needOppositeCond)
15213      CC = X86::GetOppositeBranchCondition(CC);
15214    return SetCC.getOperand(3);
15215  }
15216  }
15217
15218  return SDValue();
15219}
15220
15221/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
15222static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
15223                                  TargetLowering::DAGCombinerInfo &DCI,
15224                                  const X86Subtarget *Subtarget) {
15225  DebugLoc DL = N->getDebugLoc();
15226
15227  // If the flag operand isn't dead, don't touch this CMOV.
15228  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
15229    return SDValue();
15230
15231  SDValue FalseOp = N->getOperand(0);
15232  SDValue TrueOp = N->getOperand(1);
15233  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
15234  SDValue Cond = N->getOperand(3);
15235
15236  if (CC == X86::COND_E || CC == X86::COND_NE) {
15237    switch (Cond.getOpcode()) {
15238    default: break;
15239    case X86ISD::BSR:
15240    case X86ISD::BSF:
15241      // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
15242      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
15243        return (CC == X86::COND_E) ? FalseOp : TrueOp;
15244    }
15245  }
15246
15247  SDValue Flags;
15248
15249  Flags = checkBoolTestSetCCCombine(Cond, CC);
15250  if (Flags.getNode() &&
15251      // Extra check as FCMOV only supports a subset of X86 cond.
15252      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
15253    SDValue Ops[] = { FalseOp, TrueOp,
15254                      DAG.getConstant(CC, MVT::i8), Flags };
15255    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
15256                       Ops, array_lengthof(Ops));
15257  }
15258
15259  // If this is a select between two integer constants, try to do some
15260  // optimizations.  Note that the operands are ordered the opposite of SELECT
15261  // operands.
15262  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
15263    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
15264      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
15265      // larger than FalseC (the false value).
15266      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
15267        CC = X86::GetOppositeBranchCondition(CC);
15268        std::swap(TrueC, FalseC);
15269        std::swap(TrueOp, FalseOp);
15270      }
15271
15272      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
15273      // This is efficient for any integer data type (including i8/i16) and
15274      // shift amount.
15275      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
15276        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
15277                           DAG.getConstant(CC, MVT::i8), Cond);
15278
15279        // Zero extend the condition if needed.
15280        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
15281
15282        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
15283        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
15284                           DAG.getConstant(ShAmt, MVT::i8));
15285        if (N->getNumValues() == 2)  // Dead flag value?
15286          return DCI.CombineTo(N, Cond, SDValue());
15287        return Cond;
15288      }
15289
15290      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
15291      // for any integer data type, including i8/i16.
15292      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
15293        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
15294                           DAG.getConstant(CC, MVT::i8), Cond);
15295
15296        // Zero extend the condition if needed.
15297        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
15298                           FalseC->getValueType(0), Cond);
15299        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
15300                           SDValue(FalseC, 0));
15301
15302        if (N->getNumValues() == 2)  // Dead flag value?
15303          return DCI.CombineTo(N, Cond, SDValue());
15304        return Cond;
15305      }
15306
15307      // Optimize cases that will turn into an LEA instruction.  This requires
15308      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
15309      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
15310        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
15311        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
15312
15313        bool isFastMultiplier = false;
15314        if (Diff < 10) {
15315          switch ((unsigned char)Diff) {
15316          default: break;
15317          case 1:  // result = add base, cond
15318          case 2:  // result = lea base(    , cond*2)
15319          case 3:  // result = lea base(cond, cond*2)
15320          case 4:  // result = lea base(    , cond*4)
15321          case 5:  // result = lea base(cond, cond*4)
15322          case 8:  // result = lea base(    , cond*8)
15323          case 9:  // result = lea base(cond, cond*8)
15324            isFastMultiplier = true;
15325            break;
15326          }
15327        }
15328
15329        if (isFastMultiplier) {
15330          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
15331          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
15332                             DAG.getConstant(CC, MVT::i8), Cond);
15333          // Zero extend the condition if needed.
15334          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
15335                             Cond);
15336          // Scale the condition by the difference.
15337          if (Diff != 1)
15338            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
15339                               DAG.getConstant(Diff, Cond.getValueType()));
15340
15341          // Add the base if non-zero.
15342          if (FalseC->getAPIntValue() != 0)
15343            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
15344                               SDValue(FalseC, 0));
15345          if (N->getNumValues() == 2)  // Dead flag value?
15346            return DCI.CombineTo(N, Cond, SDValue());
15347          return Cond;
15348        }
15349      }
15350    }
15351  }
15352
15353  // Handle these cases:
15354  //   (select (x != c), e, c) -> (select (x != c), e, x),
15355  //   (select (x == c), c, e) -> (select (x == c), x, e)
15356  // where the c is an integer constant, and the "select" is the combination
15357  // of CMOV and CMP.
15358  //
15359  // The rationale for this change is that the conditional-move from a constant
15360  // needs two instructions, however, conditional-move from a register needs
15361  // only one instruction.
15362  //
15363  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
15364  //  some instruction-combining opportunities. This opt needs to be
15365  //  postponed as late as possible.
15366  //
15367  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
15368    // the DCI.xxxx conditions are provided to postpone the optimization as
15369    // late as possible.
15370
15371    ConstantSDNode *CmpAgainst = 0;
15372    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
15373        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
15374        dyn_cast<ConstantSDNode>(Cond.getOperand(0)) == 0) {
15375
15376      if (CC == X86::COND_NE &&
15377          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
15378        CC = X86::GetOppositeBranchCondition(CC);
15379        std::swap(TrueOp, FalseOp);
15380      }
15381
15382      if (CC == X86::COND_E &&
15383          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
15384        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
15385                          DAG.getConstant(CC, MVT::i8), Cond };
15386        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops,
15387                           array_lengthof(Ops));
15388      }
15389    }
15390  }
15391
15392  return SDValue();
15393}
15394
15395/// PerformMulCombine - Optimize a single multiply by a constant into two
15396/// multiplies in order to implement it with two cheaper instructions, e.g.
15397/// LEA + SHL, LEA + LEA.
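/// For example, x*45 can become (x*9)*5 (LEA + LEA) and x*40 can become
/// (x*5)<<3 (LEA + SHL).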
15398static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
15399                                 TargetLowering::DAGCombinerInfo &DCI) {
15400  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15401    return SDValue();
15402
15403  EVT VT = N->getValueType(0);
15404  if (VT != MVT::i64)
15405    return SDValue();
15406
15407  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
15408  if (!C)
15409    return SDValue();
15410  uint64_t MulAmt = C->getZExtValue();
15411  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
15412    return SDValue();
15413
15414  uint64_t MulAmt1 = 0;
15415  uint64_t MulAmt2 = 0;
15416  if ((MulAmt % 9) == 0) {
15417    MulAmt1 = 9;
15418    MulAmt2 = MulAmt / 9;
15419  } else if ((MulAmt % 5) == 0) {
15420    MulAmt1 = 5;
15421    MulAmt2 = MulAmt / 5;
15422  } else if ((MulAmt % 3) == 0) {
15423    MulAmt1 = 3;
15424    MulAmt2 = MulAmt / 3;
15425  }
15426  if (MulAmt2 &&
15427      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
15428    DebugLoc DL = N->getDebugLoc();
15429
15430    if (isPowerOf2_64(MulAmt2) &&
15431        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
15432      // If the second multiplier is pow2, issue it first. We want the multiply by
15433      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
15434      // is an add.
15435      std::swap(MulAmt1, MulAmt2);
15436
15437    SDValue NewMul;
15438    if (isPowerOf2_64(MulAmt1))
15439      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
15440                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
15441    else
15442      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
15443                           DAG.getConstant(MulAmt1, VT));
15444
15445    if (isPowerOf2_64(MulAmt2))
15446      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
15447                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
15448    else
15449      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
15450                           DAG.getConstant(MulAmt2, VT));
15451
15452    // Do not add new nodes to DAG combiner worklist.
15453    DCI.CombineTo(N, NewMul, false);
15454  }
15455  return SDValue();
15456}
15457
15458static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
15459  SDValue N0 = N->getOperand(0);
15460  SDValue N1 = N->getOperand(1);
15461  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
15462  EVT VT = N0.getValueType();
15463
15464  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
15465  // since the result of setcc_c is all zeros or all ones.
15466  if (VT.isInteger() && !VT.isVector() &&
15467      N1C && N0.getOpcode() == ISD::AND &&
15468      N0.getOperand(1).getOpcode() == ISD::Constant) {
15469    SDValue N00 = N0.getOperand(0);
15470    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
15471        ((N00.getOpcode() == ISD::ANY_EXTEND ||
15472          N00.getOpcode() == ISD::ZERO_EXTEND) &&
15473         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
15474      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
15475      APInt ShAmt = N1C->getAPIntValue();
15476      Mask = Mask.shl(ShAmt);
15477      if (Mask != 0)
15478        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
15479                           N00, DAG.getConstant(Mask, VT));
15480    }
15481  }
15482
15483  // Hardware support for vector shifts is sparse, which makes us scalarize the
15484  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
15485  // shl.
15486  // (shl V, 1) -> add V,V
15487  if (isSplatVector(N1.getNode())) {
15488    assert(N0.getValueType().isVector() && "Invalid vector shift type");
15489    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
15490    // We shift all of the values by one. In many cases we do not have
15491    // hardware support for this operation. This is better expressed as an ADD
15492    // of two values.
15493    if (N1C && (1 == N1C->getZExtValue())) {
15494      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
15495    }
15496  }
15497
15498  return SDValue();
15499}
15500
15501/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
15502///                       when possible.
15503static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
15504                                   TargetLowering::DAGCombinerInfo &DCI,
15505                                   const X86Subtarget *Subtarget) {
15506  EVT VT = N->getValueType(0);
15507  if (N->getOpcode() == ISD::SHL) {
15508    SDValue V = PerformSHLCombine(N, DAG);
15509    if (V.getNode()) return V;
15510  }
15511
15512  // On X86 with SSE2 support, we can transform this to a vector shift if
15513  // all elements are shifted by the same amount.  We can't do this in legalize
15514  // because a constant vector is typically transformed to a constant pool
15515  // so we have no knowledge of the shift amount.
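  // For example, (srl v4i32 X, <5,5,5,5>) becomes (X86ISD::VSRLI X, 5) below.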
15516  if (!Subtarget->hasSSE2())
15517    return SDValue();
15518
15519  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
15520      (!Subtarget->hasInt256() ||
15521       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
15522    return SDValue();
15523
15524  SDValue ShAmtOp = N->getOperand(1);
15525  EVT EltVT = VT.getVectorElementType();
15526  DebugLoc DL = N->getDebugLoc();
15527  SDValue BaseShAmt = SDValue();
15528  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
15529    unsigned NumElts = VT.getVectorNumElements();
15530    unsigned i = 0;
15531    for (; i != NumElts; ++i) {
15532      SDValue Arg = ShAmtOp.getOperand(i);
15533      if (Arg.getOpcode() == ISD::UNDEF) continue;
15534      BaseShAmt = Arg;
15535      break;
15536    }
15537    // Handle the case where the build_vector is all undef
15538    // FIXME: Should DAG allow this?
15539    if (i == NumElts)
15540      return SDValue();
15541
15542    for (; i != NumElts; ++i) {
15543      SDValue Arg = ShAmtOp.getOperand(i);
15544      if (Arg.getOpcode() == ISD::UNDEF) continue;
15545      if (Arg != BaseShAmt) {
15546        return SDValue();
15547      }
15548    }
15549  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
15550             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
15551    SDValue InVec = ShAmtOp.getOperand(0);
15552    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
15553      unsigned NumElts = InVec.getValueType().getVectorNumElements();
15554      unsigned i = 0;
15555      for (; i != NumElts; ++i) {
15556        SDValue Arg = InVec.getOperand(i);
15557        if (Arg.getOpcode() == ISD::UNDEF) continue;
15558        BaseShAmt = Arg;
15559        break;
15560      }
15561    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15562       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
15563         unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
15564         if (C->getZExtValue() == SplatIdx)
15565           BaseShAmt = InVec.getOperand(1);
15566       }
15567    }
15568    if (BaseShAmt.getNode() == 0) {
15569      // Don't create instructions with illegal types after legalize
15570      // types has run.
15571      if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) &&
15572          !DCI.isBeforeLegalize())
15573        return SDValue();
15574
15575      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
15576                              DAG.getIntPtrConstant(0));
15577    }
15578  } else
15579    return SDValue();
15580
15581  // The shift amount is an i32.
15582  if (EltVT.bitsGT(MVT::i32))
15583    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
15584  else if (EltVT.bitsLT(MVT::i32))
15585    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
15586
15587  // The shift amount is identical so we can do a vector shift.
15588  SDValue ValOp = N->getOperand(0);
15589  switch (N->getOpcode()) {
15590  default:
15591    llvm_unreachable("Unknown shift opcode!");
15592  case ISD::SHL:
15593    switch (VT.getSimpleVT().SimpleTy) {
15594    default: return SDValue();
15595    case MVT::v2i64:
15596    case MVT::v4i32:
15597    case MVT::v8i16:
15598    case MVT::v4i64:
15599    case MVT::v8i32:
15600    case MVT::v16i16:
15601      return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG);
15602    }
15603  case ISD::SRA:
15604    switch (VT.getSimpleVT().SimpleTy) {
15605    default: return SDValue();
15606    case MVT::v4i32:
15607    case MVT::v8i16:
15608    case MVT::v8i32:
15609    case MVT::v16i16:
15610      return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG);
15611    }
15612  case ISD::SRL:
15613    switch (VT.getSimpleVT().SimpleTy) {
15614    default: return SDValue();
15615    case MVT::v2i64:
15616    case MVT::v4i32:
15617    case MVT::v8i16:
15618    case MVT::v4i64:
15619    case MVT::v8i32:
15620    case MVT::v16i16:
15621      return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG);
15622    }
15623  }
15624}
15625
15626// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ...))
15627// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
15628// and friends.  Likewise for OR -> CMPNEQSS.
15629static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
15630                            TargetLowering::DAGCombinerInfo &DCI,
15631                            const X86Subtarget *Subtarget) {
15632  unsigned opcode;
15633
15634  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
15635  // we're requiring SSE2 for both.
15636  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
15637    SDValue N0 = N->getOperand(0);
15638    SDValue N1 = N->getOperand(1);
15639    SDValue CMP0 = N0->getOperand(1);
15640    SDValue CMP1 = N1->getOperand(1);
15641    DebugLoc DL = N->getDebugLoc();
15642
15643    // The SETCCs should both refer to the same CMP.
15644    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
15645      return SDValue();
15646
15647    SDValue CMP00 = CMP0->getOperand(0);
15648    SDValue CMP01 = CMP0->getOperand(1);
15649    EVT     VT    = CMP00.getValueType();
15650
15651    if (VT == MVT::f32 || VT == MVT::f64) {
15652      bool ExpectingFlags = false;
15653      // Check for any users that want flags:
15654      for (SDNode::use_iterator UI = N->use_begin(),
15655             UE = N->use_end();
15656           !ExpectingFlags && UI != UE; ++UI)
15657        switch (UI->getOpcode()) {
15658        default:
15659        case ISD::BR_CC:
15660        case ISD::BRCOND:
15661        case ISD::SELECT:
15662          ExpectingFlags = true;
15663          break;
15664        case ISD::CopyToReg:
15665        case ISD::SIGN_EXTEND:
15666        case ISD::ZERO_EXTEND:
15667        case ISD::ANY_EXTEND:
15668          break;
15669        }
15670
15671      if (!ExpectingFlags) {
15672        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
15673        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
15674
15675        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
15676          X86::CondCode tmp = cc0;
15677          cc0 = cc1;
15678          cc1 = tmp;
15679        }
15680
15681        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
15682            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
15683          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
15684          X86ISD::NodeType NTOperator = is64BitFP ?
15685            X86ISD::FSETCCsd : X86ISD::FSETCCss;
15686          // FIXME: need symbolic constants for these magic numbers.
15687          // See X86ATTInstPrinter.cpp:printSSECC().
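          // (In the SSE compare immediate, 0 selects EQ and 4 selects NEQ.)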
15688          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
15689          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
15690                                              DAG.getConstant(x86cc, MVT::i8));
15691          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
15692                                              OnesOrZeroesF);
15693          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
15694                                      DAG.getConstant(1, MVT::i32));
15695          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
15696          return OneBitOfTruth;
15697        }
15698      }
15699    }
15700  }
15701  return SDValue();
15702}
15703
15704/// CanFoldXORWithAllOnes - Test whether the XOR operand is an all-ones vector
15705/// so it can be folded inside ANDNP.
15706static bool CanFoldXORWithAllOnes(const SDNode *N) {
15707  EVT VT = N->getValueType(0);
15708
15709  // Match direct AllOnes for 128 and 256-bit vectors
15710  if (ISD::isBuildVectorAllOnes(N))
15711    return true;
15712
15713  // Look through a bit convert.
15714  if (N->getOpcode() == ISD::BITCAST)
15715    N = N->getOperand(0).getNode();
15716
15717  // Sometimes the operand may come from an insert_subvector building a 256-bit
15718  // all-ones vector.
15719  if (VT.is256BitVector() &&
15720      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
15721    SDValue V1 = N->getOperand(0);
15722    SDValue V2 = N->getOperand(1);
15723
15724    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
15725        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
15726        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
15727        ISD::isBuildVectorAllOnes(V2.getNode()))
15728      return true;
15729  }
15730
15731  return false;
15732}
15733
15734// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
15735// register. In most cases we actually compare or select YMM-sized registers
15736// and mixing the two types creates horrible code. This method optimizes
15737// some of the transition sequences.
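// For example, (zext v16i16 (and (trunc v16i8 A), (trunc v16i8 B))) is rebuilt
// as (and A, B) masked down to the low 8 bits of each 16-bit lane.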
15738static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
15739                                 TargetLowering::DAGCombinerInfo &DCI,
15740                                 const X86Subtarget *Subtarget) {
15741  EVT VT = N->getValueType(0);
15742  if (VT.getSizeInBits() != 256)
15743    return SDValue();
15744
15745  assert((N->getOpcode() == ISD::ANY_EXTEND ||
15746          N->getOpcode() == ISD::ZERO_EXTEND ||
15747          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
15748
15749  SDValue Narrow = N->getOperand(0);
15750  EVT NarrowVT = Narrow->getValueType(0);
15751  if (NarrowVT.getSizeInBits() != 128)
15752    return SDValue();
15753
15754  if (Narrow->getOpcode() != ISD::XOR &&
15755      Narrow->getOpcode() != ISD::AND &&
15756      Narrow->getOpcode() != ISD::OR)
15757    return SDValue();
15758
15759  SDValue N0  = Narrow->getOperand(0);
15760  SDValue N1  = Narrow->getOperand(1);
15761  DebugLoc DL = Narrow->getDebugLoc();
15762
15763  // The left side has to be a trunc.
15764  if (N0.getOpcode() != ISD::TRUNCATE)
15765    return SDValue();
15766
15767  // The type of the truncated inputs.
15768  EVT WideVT = N0->getOperand(0)->getValueType(0);
15769  if (WideVT != VT)
15770    return SDValue();
15771
15772  // The right side has to be a 'trunc' or a constant vector.
15773  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
15774  bool RHSConst = (isSplatVector(N1.getNode()) &&
15775                   isa<ConstantSDNode>(N1->getOperand(0)));
15776  if (!RHSTrunc && !RHSConst)
15777    return SDValue();
15778
15779  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15780
15781  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
15782    return SDValue();
15783
15784  // Set N0 and N1 to hold the inputs to the new wide operation.
15785  N0 = N0->getOperand(0);
15786  if (RHSConst) {
15787    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
15788                     N1->getOperand(0));
15789    SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
15790    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
15791  } else if (RHSTrunc) {
15792    N1 = N1->getOperand(0);
15793  }
15794
15795  // Generate the wide operation.
15796  SDValue Op = DAG.getNode(N->getOpcode(), DL, WideVT, N0, N1);
15797  unsigned Opcode = N->getOpcode();
15798  switch (Opcode) {
15799  case ISD::ANY_EXTEND:
15800    return Op;
15801  case ISD::ZERO_EXTEND: {
15802    unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
15803    APInt Mask = APInt::getAllOnesValue(InBits);
15804    Mask = Mask.zext(VT.getScalarType().getSizeInBits());
15805    return DAG.getNode(ISD::AND, DL, VT,
15806                       Op, DAG.getConstant(Mask, VT));
15807  }
15808  case ISD::SIGN_EXTEND:
15809    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
15810                       Op, DAG.getValueType(NarrowVT));
15811  default:
15812    llvm_unreachable("Unexpected opcode");
15813  }
15814}
15815
15816static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
15817                                 TargetLowering::DAGCombinerInfo &DCI,
15818                                 const X86Subtarget *Subtarget) {
15819  EVT VT = N->getValueType(0);
15820  if (DCI.isBeforeLegalizeOps())
15821    return SDValue();
15822
15823  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
15824  if (R.getNode())
15825    return R;
15826
15827  // Create BLSI and BLSR instructions:
15828  // BLSI is X & (-X)
15829  // BLSR is X & (X-1)
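  // (BLSI isolates the lowest set bit of X; BLSR clears it.)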
15830  if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
15831    SDValue N0 = N->getOperand(0);
15832    SDValue N1 = N->getOperand(1);
15833    DebugLoc DL = N->getDebugLoc();
15834
15835    // Check LHS for neg
15836    if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
15837        isZero(N0.getOperand(0)))
15838      return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
15839
15840    // Check RHS for neg
15841    if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
15842        isZero(N1.getOperand(0)))
15843      return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
15844
15845    // Check LHS for X-1
15846    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
15847        isAllOnes(N0.getOperand(1)))
15848      return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
15849
15850    // Check RHS for X-1
15851    if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
15852        isAllOnes(N1.getOperand(1)))
15853      return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
15854
15855    return SDValue();
15856  }
15857
15858  // Want to form ANDNP nodes:
15859  // 1) In the hopes of then easily combining them with OR and AND nodes
15860  //    to form PBLEND/PSIGN.
15861  // 2) To match ANDN packed intrinsics
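  // Note that ANDNP computes (~LHS) & RHS (the PANDN/VPANDN semantics), which
  // is why an (and (xor X, all-ones), Y) pair below maps directly onto it.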
15862  if (VT != MVT::v2i64 && VT != MVT::v4i64)
15863    return SDValue();
15864
15865  SDValue N0 = N->getOperand(0);
15866  SDValue N1 = N->getOperand(1);
15867  DebugLoc DL = N->getDebugLoc();
15868
15869  // Check LHS for vnot
15870  if (N0.getOpcode() == ISD::XOR &&
15871      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
15872      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
15873    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
15874
15875  // Check RHS for vnot
15876  if (N1.getOpcode() == ISD::XOR &&
15877      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
15878      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
15879    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
15880
15881  return SDValue();
15882}
15883
15884static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
15885                                TargetLowering::DAGCombinerInfo &DCI,
15886                                const X86Subtarget *Subtarget) {
15887  EVT VT = N->getValueType(0);
15888  if (DCI.isBeforeLegalizeOps())
15889    return SDValue();
15890
15891  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
15892  if (R.getNode())
15893    return R;
15894
15895  SDValue N0 = N->getOperand(0);
15896  SDValue N1 = N->getOperand(1);
15897
15898  // look for psign/blend
15899  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
15900    if (!Subtarget->hasSSSE3() ||
15901        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
15902      return SDValue();
15903
15904    // Canonicalize pandn to RHS
15905    if (N0.getOpcode() == X86ISD::ANDNP)
15906      std::swap(N0, N1);
15907    // or (and (m, y), (pandn m, x))
15908    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
15909      SDValue Mask = N1.getOperand(0);
15910      SDValue X    = N1.getOperand(1);
15911      SDValue Y;
15912      if (N0.getOperand(0) == Mask)
15913        Y = N0.getOperand(1);
15914      if (N0.getOperand(1) == Mask)
15915        Y = N0.getOperand(0);
15916
15917      // Check to see if the mask appeared in both the AND and ANDNP.
15918      if (!Y.getNode())
15919        return SDValue();
15920
15921      // Validate that X, Y, and Mask are BITCASTs, and see through them.
15922      // Look through mask bitcast.
15923      if (Mask.getOpcode() == ISD::BITCAST)
15924        Mask = Mask.getOperand(0);
15925      if (X.getOpcode() == ISD::BITCAST)
15926        X = X.getOperand(0);
15927      if (Y.getOpcode() == ISD::BITCAST)
15928        Y = Y.getOperand(0);
15929
15930      EVT MaskVT = Mask.getValueType();
15931
15932      // Validate that the Mask operand is a vector sra node.
15933      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
15934      // there is no psrai.b
15935      if (Mask.getOpcode() != X86ISD::VSRAI)
15936        return SDValue();
15937
15938      // Check that the SRA is all signbits.
15939      SDValue SraC = Mask.getOperand(1);
15940      unsigned SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
15941      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
15942      if ((SraAmt + 1) != EltBits)
15943        return SDValue();
15944
15945      DebugLoc DL = N->getDebugLoc();
15946
15947      // We are going to replace the AND, OR, and ANDNP with either BLEND
15948      // or PSIGN, both of which only look at the MSB of the mask. The VSRAI
15949      // does not affect the sign bit, so we can get rid of it.
15950      Mask = Mask.getOperand(0);
15951
15952      // Now we know we at least have a pblendvb with the mask val.  See if
15953      // we can form a psignb/w/d.
15954      // psign = x.type == y.type == mask.type && y = sub(0, x);
15955      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
15956          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
15957          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
15958        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
15959               "Unsupported VT for PSIGN");
15960        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask);
15961        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
15962      }
15963      // PBLENDVB is only available with SSE 4.1.
15964      if (!Subtarget->hasSSE41())
15965        return SDValue();
15966
15967      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
15968
15969      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
15970      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
15971      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
15972      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
15973      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
15974    }
15975  }
15976
15977  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
15978    return SDValue();
15979
15980  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
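  // For example, with i32 operands and c == 8:
  //   (x << 8) | (y >> 24)  ==>  shld x, y, 8
  // i.e. the high 8 bits of y are shifted in as the low bits of the result.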
15981  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
15982    std::swap(N0, N1);
15983  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
15984    return SDValue();
15985  if (!N0.hasOneUse() || !N1.hasOneUse())
15986    return SDValue();
15987
15988  SDValue ShAmt0 = N0.getOperand(1);
15989  if (ShAmt0.getValueType() != MVT::i8)
15990    return SDValue();
15991  SDValue ShAmt1 = N1.getOperand(1);
15992  if (ShAmt1.getValueType() != MVT::i8)
15993    return SDValue();
15994  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
15995    ShAmt0 = ShAmt0.getOperand(0);
15996  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
15997    ShAmt1 = ShAmt1.getOperand(0);
15998
15999  DebugLoc DL = N->getDebugLoc();
16000  unsigned Opc = X86ISD::SHLD;
16001  SDValue Op0 = N0.getOperand(0);
16002  SDValue Op1 = N1.getOperand(0);
16003  if (ShAmt0.getOpcode() == ISD::SUB) {
16004    Opc = X86ISD::SHRD;
16005    std::swap(Op0, Op1);
16006    std::swap(ShAmt0, ShAmt1);
16007  }
16008
16009  unsigned Bits = VT.getSizeInBits();
16010  if (ShAmt1.getOpcode() == ISD::SUB) {
16011    SDValue Sum = ShAmt1.getOperand(0);
16012    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
16013      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
16014      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
16015        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
16016      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
16017        return DAG.getNode(Opc, DL, VT,
16018                           Op0, Op1,
16019                           DAG.getNode(ISD::TRUNCATE, DL,
16020                                       MVT::i8, ShAmt0));
16021    }
16022  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
16023    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
16024    if (ShAmt0C &&
16025        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
16026      return DAG.getNode(Opc, DL, VT,
16027                         N0.getOperand(0), N1.getOperand(0),
16028                         DAG.getNode(ISD::TRUNCATE, DL,
16029                                       MVT::i8, ShAmt0));
16030  }
16031
16032  return SDValue();
16033}
16034
16035// Generate NEG and CMOV for integer abs.
16036static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
16037  EVT VT = N->getValueType(0);
16038
16039  // Since X86 does not have CMOV for 8-bit integer, we don't convert
16040  // 8-bit integer abs to NEG and CMOV.
16041  if (VT.isInteger() && VT.getSizeInBits() == 8)
16042    return SDValue();
16043
16044  SDValue N0 = N->getOperand(0);
16045  SDValue N1 = N->getOperand(1);
16046  DebugLoc DL = N->getDebugLoc();
16047
16048  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
16049  // and change it to SUB and CMOV.
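  // With Y = SRA(X, size-1), Y is 0 when X >= 0 and all-ones when X < 0, so
  // (X + Y) ^ Y is X for non-negative X and ~(X - 1) == -X otherwise, i.e. |X|.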
16050  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
16051      N0.getOpcode() == ISD::ADD &&
16052      N0.getOperand(1) == N1 &&
16053      N1.getOpcode() == ISD::SRA &&
16054      N1.getOperand(0) == N0.getOperand(0))
16055    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
16056      if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
16057        // Generate SUB & CMOV.
16058        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
16059                                  DAG.getConstant(0, VT), N0.getOperand(0));
16060
16061        SDValue Ops[] = { N0.getOperand(0), Neg,
16062                          DAG.getConstant(X86::COND_GE, MVT::i8),
16063                          SDValue(Neg.getNode(), 1) };
16064        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
16065                           Ops, array_lengthof(Ops));
16066      }
16067  return SDValue();
16068}
16069
16070// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
16071static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
16072                                 TargetLowering::DAGCombinerInfo &DCI,
16073                                 const X86Subtarget *Subtarget) {
16074  EVT VT = N->getValueType(0);
16075  if (DCI.isBeforeLegalizeOps())
16076    return SDValue();
16077
16078  if (Subtarget->hasCMov()) {
16079    SDValue RV = performIntegerAbsCombine(N, DAG);
16080    if (RV.getNode())
16081      return RV;
16082  }
16083
16084  // Try forming BMI instructions if BMI is available.
16085  if (!Subtarget->hasBMI())
16086    return SDValue();
16087
16088  if (VT != MVT::i32 && VT != MVT::i64)
16089    return SDValue();
16090
16091  assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
16092
16093  // Create BLSMSK instructions by finding X ^ (X-1)
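  // For example, with X = 0b01011000:
  //   X ^ (X-1) = 0b00001111   (mask up to and including the lowest set bit)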
16094  SDValue N0 = N->getOperand(0);
16095  SDValue N1 = N->getOperand(1);
16096  DebugLoc DL = N->getDebugLoc();
16097
16098  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
16099      isAllOnes(N0.getOperand(1)))
16100    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
16101
16102  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
16103      isAllOnes(N1.getOperand(1)))
16104    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
16105
16106  return SDValue();
16107}
16108
16109/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
16110static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
16111                                  TargetLowering::DAGCombinerInfo &DCI,
16112                                  const X86Subtarget *Subtarget) {
16113  LoadSDNode *Ld = cast<LoadSDNode>(N);
16114  EVT RegVT = Ld->getValueType(0);
16115  EVT MemVT = Ld->getMemoryVT();
16116  DebugLoc dl = Ld->getDebugLoc();
16117  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16118
16119  ISD::LoadExtType Ext = Ld->getExtensionType();
16120
16121  // If this is a vector EXT Load then attempt to optimize it using a
16122  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
16123  // expansion is still better than scalar code.
16124  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
16125  // emit a shuffle and an arithmetic shift.
16126  // TODO: It is possible to support ZExt by zeroing the undef values
16127  // during the shuffle phase or after the shuffle.
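  // For example, a sextload of <4 x i8> to v4i32 becomes a single i32 scalar
  // load, a shuffle that moves each byte into the top byte of its i32 lane,
  // and a vector arithmetic shift right by 24 (or a single VSEXT on SSE4.1+).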
16128  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
16129      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
16130    assert(MemVT != RegVT && "Cannot extend to the same type");
16131    assert(MemVT.isVector() && "Must load a vector from memory");
16132
16133    unsigned NumElems = RegVT.getVectorNumElements();
16134    unsigned RegSz = RegVT.getSizeInBits();
16135    unsigned MemSz = MemVT.getSizeInBits();
16136    assert(RegSz > MemSz && "Register size must be greater than the mem size");
16137
16138    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
16139      return SDValue();
16140
16141    // All sizes must be a power of two.
16142    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
16143      return SDValue();
16144
16145    // Attempt to load the original value using scalar loads.
16146    // Find the largest scalar type that divides the total loaded size.
16147    MVT SclrLoadTy = MVT::i8;
16148    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
16149         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
16150      MVT Tp = (MVT::SimpleValueType)tp;
16151      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16152        SclrLoadTy = Tp;
16153      }
16154    }
16155
16156    // On 32-bit systems, 64-bit integer loads aren't legal; try loading f64 instead.
16157    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16158        (64 <= MemSz))
16159      SclrLoadTy = MVT::f64;
16160
16161    // Calculate the number of scalar loads that we need to perform
16162    // in order to load our vector from memory.
16163    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16164    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
16165      return SDValue();
16166
16167    unsigned loadRegSize = RegSz;
16168    if (Ext == ISD::SEXTLOAD && RegSz == 256)
16169      loadRegSize /= 2;
16170
16171    // Represent our vector as a sequence of elements which are the
16172    // largest scalar that we can load.
16173    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
16174      loadRegSize/SclrLoadTy.getSizeInBits());
16175
16176    // Represent the data using the same element type that is stored in
16177    // memory. In practice, we "widen" MemVT.
16178    EVT WideVecVT =
16179      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16180                       loadRegSize/MemVT.getScalarType().getSizeInBits());
16181
16182    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16183      "Invalid vector type");
16184
16185    // We can't shuffle using an illegal type.
16186    if (!TLI.isTypeLegal(WideVecVT))
16187      return SDValue();
16188
16189    SmallVector<SDValue, 8> Chains;
16190    SDValue Ptr = Ld->getBasePtr();
16191    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
16192                                        TLI.getPointerTy());
16193    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16194
16195    for (unsigned i = 0; i < NumLoads; ++i) {
16196      // Perform a single load.
16197      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
16198                                       Ptr, Ld->getPointerInfo(),
16199                                       Ld->isVolatile(), Ld->isNonTemporal(),
16200                                       Ld->isInvariant(), Ld->getAlignment());
16201      Chains.push_back(ScalarLoad.getValue(1));
16202      // Create the first element using SCALAR_TO_VECTOR in order to avoid
16203      // another round of DAG combining.
16204      if (i == 0)
16205        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16206      else
16207        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16208                          ScalarLoad, DAG.getIntPtrConstant(i));
16209
16210      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16211    }
16212
16213    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
16214                               Chains.size());
16215
16216    // Bitcast the loaded value to a vector of the original element type, in
16217    // the size of the target vector type.
16218    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16219    unsigned SizeRatio = RegSz/MemSz;
16220
16221    if (Ext == ISD::SEXTLOAD) {
16222      // If we have SSE4.1 we can directly emit a VSEXT node.
16223      if (Subtarget->hasSSE41()) {
16224        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16225        return DCI.CombineTo(N, Sext, TF, true);
16226      }
16227
16228      // Otherwise we'll shuffle the small elements into the high bits of the
16229      // larger type and perform an arithmetic shift. If the shift is not legal
16230      // it's better to scalarize.
16231      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
16232        return SDValue();
16233
16234      // Redistribute the loaded elements into the different locations.
16235      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16236      for (unsigned i = 0; i != NumElems; ++i)
16237        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
16238
16239      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16240                                           DAG.getUNDEF(WideVecVT),
16241                                           &ShuffleVec[0]);
16242
16243      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16244
16245      // Build the arithmetic shift.
16246      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16247                     MemVT.getVectorElementType().getSizeInBits();
16248      SmallVector<SDValue, 8> C(NumElems,
16249                                DAG.getConstant(Amt, RegVT.getScalarType()));
16250      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size());
16251      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV);
16252
16253      return DCI.CombineTo(N, Shuff, TF, true);
16254    }
16255
16256    // Redistribute the loaded elements into the different locations.
16257    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16258    for (unsigned i = 0; i != NumElems; ++i)
16259      ShuffleVec[i*SizeRatio] = i;
16260
16261    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16262                                         DAG.getUNDEF(WideVecVT),
16263                                         &ShuffleVec[0]);
16264
16265    // Bitcast to the requested type.
16266    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16267    // Replace the original load with the new sequence
16268    // and return the new chain.
16269    return DCI.CombineTo(N, Shuff, TF, true);
16270  }
16271
16272  return SDValue();
16273}
16274
16275/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
16276static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
16277                                   const X86Subtarget *Subtarget) {
16278  StoreSDNode *St = cast<StoreSDNode>(N);
16279  EVT VT = St->getValue().getValueType();
16280  EVT StVT = St->getMemoryVT();
16281  DebugLoc dl = St->getDebugLoc();
16282  SDValue StoredVal = St->getOperand(1);
16283  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16284
16285  // If we are saving a concatenation of two XMM registers, perform two stores.
16286  // On Sandy Bridge, 256-bit memory operations are executed by two
16287  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
16288  // memory operation.
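  // For example, a 256-bit store of (concat_vectors x, y) becomes two 128-bit
  // stores at Ptr and Ptr+16, joined by a TokenFactor.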
16289  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
16290      StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
16291      StoredVal.getNumOperands() == 2) {
16292    SDValue Value0 = StoredVal.getOperand(0);
16293    SDValue Value1 = StoredVal.getOperand(1);
16294
16295    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
16296    SDValue Ptr0 = St->getBasePtr();
16297    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
16298
16299    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
16300                                St->getPointerInfo(), St->isVolatile(),
16301                                St->isNonTemporal(), St->getAlignment());
16302    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
16303                                St->getPointerInfo(), St->isVolatile(),
16304                                St->isNonTemporal(), St->getAlignment());
16305    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
16306  }
16307
16308  // Optimize trunc store (of multiple scalars) to shuffle and store.
16309  // First, pack all of the elements in one place. Next, store to memory
16310  // in fewer chunks.
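  // For example, a truncating store of v8i32 to v8i16 is rewritten as a
  // shuffle that packs the eight 16-bit results into the low 128 bits,
  // followed by two 64-bit chunk stores (i64, or f64 when i64 is not legal).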
16311  if (St->isTruncatingStore() && VT.isVector()) {
16312    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16313    unsigned NumElems = VT.getVectorNumElements();
16314    assert(StVT != VT && "Cannot truncate to the same type");
16315    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
16316    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
16317
16318    // The From and To sizes and the element count must be powers of two.
16319    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
16320    // We are going to use the original vector elt for storing.
16321    // Accumulated smaller vector elements must be a multiple of the store size.
16322    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
16323
16324    unsigned SizeRatio  = FromSz / ToSz;
16325
16326    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
16327
16328    // Create a type on which we perform the shuffle
16329    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
16330            StVT.getScalarType(), NumElems*SizeRatio);
16331
16332    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16333
16334    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
16335    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16336    for (unsigned i = 0; i != NumElems; ++i)
16337      ShuffleVec[i] = i * SizeRatio;
16338
16339    // Can't shuffle using an illegal type.
16340    if (!TLI.isTypeLegal(WideVecVT))
16341      return SDValue();
16342
16343    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
16344                                         DAG.getUNDEF(WideVecVT),
16345                                         &ShuffleVec[0]);
16346    // At this point all of the data is stored at the bottom of the
16347    // register. We now need to save it to mem.
16348
16349    // Find the largest store unit
16350    MVT StoreType = MVT::i8;
16351    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
16352         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
16353      MVT Tp = (MVT::SimpleValueType)tp;
16354      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
16355        StoreType = Tp;
16356    }
16357
16358    // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
16359    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
16360        (64 <= NumElems * ToSz))
16361      StoreType = MVT::f64;
16362
16363    // Bitcast the original vector into a vector of store-size units
16364    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
16365            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
16366    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16367    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
16368    SmallVector<SDValue, 8> Chains;
16369    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
16370                                        TLI.getPointerTy());
16371    SDValue Ptr = St->getBasePtr();
16372
16373    // Perform one or more big stores into memory.
16374    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
16375      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
16376                                   StoreType, ShuffWide,
16377                                   DAG.getIntPtrConstant(i));
16378      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
16379                                St->getPointerInfo(), St->isVolatile(),
16380                                St->isNonTemporal(), St->getAlignment());
16381      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16382      Chains.push_back(Ch);
16383    }
16384
16385    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
16386                               Chains.size());
16387  }
16388
16389  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
16390  // the FP state in cases where an emms may be missing.
16391  // A preferable solution to the general problem is to figure out the right
16392  // places to insert EMMS.  This qualifies as a quick hack.
16393
16394  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
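  // For example, on a 32-bit target with SSE2, an i64 load feeding an i64
  // store is turned into a single f64 load/store pair; 64-bit MMX vector
  // values without a usable f64 are split into two i32 load/store pairs below.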
16395  if (VT.getSizeInBits() != 64)
16396    return SDValue();
16397
16398  const Function *F = DAG.getMachineFunction().getFunction();
16399  bool NoImplicitFloatOps = F->getFnAttributes().
16400    hasAttribute(Attribute::NoImplicitFloat);
16401  bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
16402                     && Subtarget->hasSSE2();
16403  if ((VT.isVector() ||
16404       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
16405      isa<LoadSDNode>(St->getValue()) &&
16406      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
16407      St->getChain().hasOneUse() && !St->isVolatile()) {
16408    SDNode* LdVal = St->getValue().getNode();
16409    LoadSDNode *Ld = 0;
16410    int TokenFactorIndex = -1;
16411    SmallVector<SDValue, 8> Ops;
16412    SDNode* ChainVal = St->getChain().getNode();
16413    // Must be a store of a load.  We currently handle two cases:  the load
16414    // is a direct child, and it's under an intervening TokenFactor.  It is
16415    // possible to dig deeper under nested TokenFactors.
16416    if (ChainVal == LdVal)
16417      Ld = cast<LoadSDNode>(St->getChain());
16418    else if (St->getValue().hasOneUse() &&
16419             ChainVal->getOpcode() == ISD::TokenFactor) {
16420      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
16421        if (ChainVal->getOperand(i).getNode() == LdVal) {
16422          TokenFactorIndex = i;
16423          Ld = cast<LoadSDNode>(St->getValue());
16424        } else
16425          Ops.push_back(ChainVal->getOperand(i));
16426      }
16427    }
16428
16429    if (!Ld || !ISD::isNormalLoad(Ld))
16430      return SDValue();
16431
16432    // If this is not the MMX case, i.e. we are just turning i64 load/store
16433    // into f64 load/store, avoid the transformation if there are multiple
16434    // uses of the loaded value.
16435    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
16436      return SDValue();
16437
16438    DebugLoc LdDL = Ld->getDebugLoc();
16439    DebugLoc StDL = N->getDebugLoc();
16440    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
16441    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
16442    // pair instead.
16443    if (Subtarget->is64Bit() || F64IsLegal) {
16444      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
16445      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
16446                                  Ld->getPointerInfo(), Ld->isVolatile(),
16447                                  Ld->isNonTemporal(), Ld->isInvariant(),
16448                                  Ld->getAlignment());
16449      SDValue NewChain = NewLd.getValue(1);
16450      if (TokenFactorIndex != -1) {
16451        Ops.push_back(NewChain);
16452        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
16453                               Ops.size());
16454      }
16455      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
16456                          St->getPointerInfo(),
16457                          St->isVolatile(), St->isNonTemporal(),
16458                          St->getAlignment());
16459    }
16460
16461    // Otherwise, lower to two pairs of 32-bit loads / stores.
16462    SDValue LoAddr = Ld->getBasePtr();
16463    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
16464                                 DAG.getConstant(4, MVT::i32));
16465
16466    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
16467                               Ld->getPointerInfo(),
16468                               Ld->isVolatile(), Ld->isNonTemporal(),
16469                               Ld->isInvariant(), Ld->getAlignment());
16470    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
16471                               Ld->getPointerInfo().getWithOffset(4),
16472                               Ld->isVolatile(), Ld->isNonTemporal(),
16473                               Ld->isInvariant(),
16474                               MinAlign(Ld->getAlignment(), 4));
16475
16476    SDValue NewChain = LoLd.getValue(1);
16477    if (TokenFactorIndex != -1) {
16478      Ops.push_back(LoLd);
16479      Ops.push_back(HiLd);
16480      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
16481                             Ops.size());
16482    }
16483
16484    LoAddr = St->getBasePtr();
16485    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
16486                         DAG.getConstant(4, MVT::i32));
16487
16488    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
16489                                St->getPointerInfo(),
16490                                St->isVolatile(), St->isNonTemporal(),
16491                                St->getAlignment());
16492    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
16493                                St->getPointerInfo().getWithOffset(4),
16494                                St->isVolatile(),
16495                                St->isNonTemporal(),
16496                                MinAlign(St->getAlignment(), 4));
16497    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
16498  }
16499  return SDValue();
16500}
16501
16502/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
16503/// and return the operands for the horizontal operation in LHS and RHS.  A
16504/// horizontal operation performs the binary operation on successive elements
16505/// of its first operand, then on successive elements of its second operand,
16506/// returning the resulting values in a vector.  For example, if
16507///   A = < float a0, float a1, float a2, float a3 >
16508/// and
16509///   B = < float b0, float b1, float b2, float b3 >
16510/// then the result of doing a horizontal operation on A and B is
16511///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
16512/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
16513/// A horizontal-op B, for some already available A and B, and if so then LHS is
16514/// set to A, RHS to B, and the routine returns 'true'.
16515/// Note that the binary operation should have the property that if one of the
16516/// operands is UNDEF then the result is UNDEF.
16517static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
16518  // Look for the following pattern: if
16519  //   A = < float a0, float a1, float a2, float a3 >
16520  //   B = < float b0, float b1, float b2, float b3 >
16521  // and
16522  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
16523  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
16524  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
16525  // which is A horizontal-op B.
16526
16527  // At least one of the operands should be a vector shuffle.
16528  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
16529      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
16530    return false;
16531
16532  EVT VT = LHS.getValueType();
16533
16534  assert((VT.is128BitVector() || VT.is256BitVector()) &&
16535         "Unsupported vector type for horizontal add/sub");
16536
16537  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
16538  // operate independently on 128-bit lanes.
16539  unsigned NumElts = VT.getVectorNumElements();
16540  unsigned NumLanes = VT.getSizeInBits()/128;
16541  unsigned NumLaneElts = NumElts / NumLanes;
16542  assert((NumLaneElts % 2 == 0) &&
16543         "Vector type should have an even number of elements in each lane");
16544  unsigned HalfLaneElts = NumLaneElts/2;
16545
16546  // View LHS in the form
16547  //   LHS = VECTOR_SHUFFLE A, B, LMask
16548  // If LHS is not a shuffle then pretend it is the shuffle
16549  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
16550  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
16551  // type VT.
16552  SDValue A, B;
16553  SmallVector<int, 16> LMask(NumElts);
16554  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
16555    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
16556      A = LHS.getOperand(0);
16557    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
16558      B = LHS.getOperand(1);
16559    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
16560    std::copy(Mask.begin(), Mask.end(), LMask.begin());
16561  } else {
16562    if (LHS.getOpcode() != ISD::UNDEF)
16563      A = LHS;
16564    for (unsigned i = 0; i != NumElts; ++i)
16565      LMask[i] = i;
16566  }
16567
16568  // Likewise, view RHS in the form
16569  //   RHS = VECTOR_SHUFFLE C, D, RMask
16570  SDValue C, D;
16571  SmallVector<int, 16> RMask(NumElts);
16572  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
16573    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
16574      C = RHS.getOperand(0);
16575    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
16576      D = RHS.getOperand(1);
16577    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
16578    std::copy(Mask.begin(), Mask.end(), RMask.begin());
16579  } else {
16580    if (RHS.getOpcode() != ISD::UNDEF)
16581      C = RHS;
16582    for (unsigned i = 0; i != NumElts; ++i)
16583      RMask[i] = i;
16584  }
16585
16586  // Check that the shuffles are both shuffling the same vectors.
16587  if (!(A == C && B == D) && !(A == D && B == C))
16588    return false;
16589
16590  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
16591  if (!A.getNode() && !B.getNode())
16592    return false;
16593
16594  // If A and B occur in reverse order in RHS, then "swap" them (which means
16595  // rewriting the mask).
16596  if (A != C)
16597    CommuteVectorShuffleMask(RMask, NumElts);
16598
16599  // At this point LHS and RHS are equivalent to
16600  //   LHS = VECTOR_SHUFFLE A, B, LMask
16601  //   RHS = VECTOR_SHUFFLE A, B, RMask
16602  // Check that the masks correspond to performing a horizontal operation.
16603  for (unsigned i = 0; i != NumElts; ++i) {
16604    int LIdx = LMask[i], RIdx = RMask[i];
16605
16606    // Ignore any UNDEF components.
16607    if (LIdx < 0 || RIdx < 0 ||
16608        (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
16609        (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
16610      continue;
16611
16612    // Check that successive elements are being operated on.  If not, this is
16613    // not a horizontal operation.
16614    unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
16615    unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
16616    int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
16617    if (!(LIdx == Index && RIdx == Index + 1) &&
16618        !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
16619      return false;
16620  }
16621
16622  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
16623  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
16624  return true;
16625}
16626
16627/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
16628static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
16629                                  const X86Subtarget *Subtarget) {
16630  EVT VT = N->getValueType(0);
16631  SDValue LHS = N->getOperand(0);
16632  SDValue RHS = N->getOperand(1);
16633
16634  // Try to synthesize horizontal adds from adds of shuffles.
16635  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
16636       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
16637      isHorizontalBinOp(LHS, RHS, true))
16638    return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
16639  return SDValue();
16640}
16641
16642/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
16643static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
16644                                  const X86Subtarget *Subtarget) {
16645  EVT VT = N->getValueType(0);
16646  SDValue LHS = N->getOperand(0);
16647  SDValue RHS = N->getOperand(1);
16648
16649  // Try to synthesize horizontal subs from subs of shuffles.
16650  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
16651       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
16652      isHorizontalBinOp(LHS, RHS, false))
16653    return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
16654  return SDValue();
16655}
16656
16657/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
16658/// X86ISD::FXOR nodes.
16659static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
16660  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
16661  // F[X]OR(0.0, x) -> x
16662  // F[X]OR(x, 0.0) -> x
16663  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
16664    if (C->getValueAPF().isPosZero())
16665      return N->getOperand(1);
16666  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
16667    if (C->getValueAPF().isPosZero())
16668      return N->getOperand(0);
16669  return SDValue();
16670}
16671
16672/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
16673/// X86ISD::FMAX nodes.
16674static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
16675  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
16676
16677  // Only perform optimizations if UnsafeMath is used.
16678  if (!DAG.getTarget().Options.UnsafeFPMath)
16679    return SDValue();
16680
16681  // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
16682  // into FMINC and FMAXC, which are commutative operations.
16683  unsigned NewOp = 0;
16684  switch (N->getOpcode()) {
16685    default: llvm_unreachable("unknown opcode");
16686    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
16687    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
16688  }
16689
16690  return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
16691                     N->getOperand(0), N->getOperand(1));
16692}
16693
16694/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
16695static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
16696  // FAND(0.0, x) -> 0.0
16697  // FAND(x, 0.0) -> 0.0
16698  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
16699    if (C->getValueAPF().isPosZero())
16700      return N->getOperand(0);
16701  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
16702    if (C->getValueAPF().isPosZero())
16703      return N->getOperand(1);
16704  return SDValue();
16705}
16706
16707static SDValue PerformBTCombine(SDNode *N,
16708                                SelectionDAG &DAG,
16709                                TargetLowering::DAGCombinerInfo &DCI) {
16710  // BT ignores high bits in the bit index operand.
16711  SDValue Op1 = N->getOperand(1);
16712  if (Op1.hasOneUse()) {
16713    unsigned BitWidth = Op1.getValueSizeInBits();
16714    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
16715    APInt KnownZero, KnownOne;
16716    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
16717                                          !DCI.isBeforeLegalizeOps());
16718    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16719    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
16720        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
16721      DCI.CommitTargetLoweringOpt(TLO);
16722  }
16723  return SDValue();
16724}
16725
16726static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
16727  SDValue Op = N->getOperand(0);
16728  if (Op.getOpcode() == ISD::BITCAST)
16729    Op = Op.getOperand(0);
16730  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
16731  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
16732      VT.getVectorElementType().getSizeInBits() ==
16733      OpVT.getVectorElementType().getSizeInBits()) {
16734    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
16735  }
16736  return SDValue();
16737}
16738
16739static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
16740                                  TargetLowering::DAGCombinerInfo &DCI,
16741                                  const X86Subtarget *Subtarget) {
16742  if (!DCI.isBeforeLegalizeOps())
16743    return SDValue();
16744
16745  if (!Subtarget->hasFp256())
16746    return SDValue();
16747
16748  EVT VT = N->getValueType(0);
16749  SDValue Op = N->getOperand(0);
16750  EVT OpVT = Op.getValueType();
16751  DebugLoc dl = N->getDebugLoc();
16752
16753  if (VT.isVector() && VT.getSizeInBits() == 256) {
16754    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
16755    if (R.getNode())
16756      return R;
16757  }
16758
16759  if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
16760      (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
16761
16762    if (Subtarget->hasInt256())
16763      return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
16764
16765    // Optimize vectors in AVX mode:
16766    // sign extend  v8i16 to v8i32 and
16767    //              v4i32 to v4i64.
16768    //
16769    // Divide the input vector into two halves; for v4i32 the shuffle masks
16770    // are { 0, 1, -1, -1} and {2, 3, -1, -1}. Use the vpmovsx instruction to
16771    // extend the halves (v4i32 -> v2i64; v8i16 -> v4i32) and then
16772    // concatenate the results back to the original VT.
16773
16774    unsigned NumElems = OpVT.getVectorNumElements();
16775    SDValue Undef = DAG.getUNDEF(OpVT);
16776
16777    SmallVector<int,8> ShufMask1(NumElems, -1);
16778    for (unsigned i = 0; i != NumElems/2; ++i)
16779      ShufMask1[i] = i;
16780
16781    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]);
16782
16783    SmallVector<int,8> ShufMask2(NumElems, -1);
16784    for (unsigned i = 0; i != NumElems/2; ++i)
16785      ShufMask2[i] = i + NumElems/2;
16786
16787    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]);
16788
16789    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
16790                                  VT.getVectorNumElements()/2);
16791
16792    OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
16793    OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
16794
16795    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16796  }
16797  return SDValue();
16798}
16799
16800static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
16801                                 const X86Subtarget* Subtarget) {
16802  DebugLoc dl = N->getDebugLoc();
16803  EVT VT = N->getValueType(0);
16804
16805  // Let legalize expand this if it isn't a legal type yet.
16806  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
16807    return SDValue();
16808
16809  EVT ScalarVT = VT.getScalarType();
16810  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
16811      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
16812    return SDValue();
16813
16814  SDValue A = N->getOperand(0);
16815  SDValue B = N->getOperand(1);
16816  SDValue C = N->getOperand(2);
16817
16818  bool NegA = (A.getOpcode() == ISD::FNEG);
16819  bool NegB = (B.getOpcode() == ISD::FNEG);
16820  bool NegC = (C.getOpcode() == ISD::FNEG);
16821
16822  // The multiplication is negated when exactly one of A and B is negated.
16823  bool NegMul = (NegA != NegB);
16824  if (NegA)
16825    A = A.getOperand(0);
16826  if (NegB)
16827    B = B.getOperand(0);
16828  if (NegC)
16829    C = C.getOperand(0);
16830
16831  unsigned Opcode;
16832  if (!NegMul)
16833    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
16834  else
16835    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
16836
16837  return DAG.getNode(Opcode, dl, VT, A, B, C);
16838}
16839
16840static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
16841                                  TargetLowering::DAGCombinerInfo &DCI,
16842                                  const X86Subtarget *Subtarget) {
16843  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
16844  //           (and (i32 x86isd::setcc_carry), 1)
16845  // This eliminates the zext. This transformation is necessary because
16846  // ISD::SETCC is always legalized to i8.
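  // SETCC_CARRY materializes 0 or all-ones (via sbb), so recreating it in the
  // wider type and masking with 1 gives the same 0/1 value without the zext.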
16847  DebugLoc dl = N->getDebugLoc();
16848  SDValue N0 = N->getOperand(0);
16849  EVT VT = N->getValueType(0);
16850  EVT OpVT = N0.getValueType();
16851
16852  if (N0.getOpcode() == ISD::AND &&
16853      N0.hasOneUse() &&
16854      N0.getOperand(0).hasOneUse()) {
16855    SDValue N00 = N0.getOperand(0);
16856    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
16857      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
16858      if (!C || C->getZExtValue() != 1)
16859        return SDValue();
16860      return DAG.getNode(ISD::AND, dl, VT,
16861                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
16862                                     N00.getOperand(0), N00.getOperand(1)),
16863                         DAG.getConstant(1, VT));
16864    }
16865  }
16866
16867  if (VT.isVector() && VT.getSizeInBits() == 256) {
16868    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
16869    if (R.getNode())
16870      return R;
16871  }
16872
16873  // Optimize vectors in AVX mode:
16874  //
16875  //   v8i16 -> v8i32
16876  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
16877  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
16878  //   Concat upper and lower parts.
16879  //
16880  //   v4i32 -> v4i64
16881  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
16882  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
16883  //   Concat upper and lower parts.
16884  //
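  // Interleaving with a zero vector zero-extends each element: for v8i16,
  // punpcklwd(x, 0) = <x0,0,x1,0,x2,0,x3,0>, which viewed as v4i32 is the low
  // four elements zero-extended; punpckhwd produces the upper four.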
16885  if (!DCI.isBeforeLegalizeOps())
16886    return SDValue();
16887
16888  if (!Subtarget->hasFp256())
16889    return SDValue();
16890
16891  if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
16892      ((VT == MVT::v4i64) && (OpVT == MVT::v4i32)))  {
16893
16894    if (Subtarget->hasInt256())
16895      return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
16896
16897    SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
16898    SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec);
16899    SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec);
16900
16901    EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
16902                               VT.getVectorNumElements()/2);
16903
16904    OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
16905    OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
16906
16907    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16908  }
16909
16910  return SDValue();
16911}
16912
16913// Optimize x == -y --> x+y == 0
16914//          x != -y --> x+y != 0
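// Both directions hold in two's complement arithmetic, since x == -y exactly
// when x + y wraps to 0; this only applies to the EQ/NE condition codes.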
16915static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
16916  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16917  SDValue LHS = N->getOperand(0);
16918  SDValue RHS = N->getOperand(1);
16919
16920  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
16921    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
16922      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
16923        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
16924                                   LHS.getValueType(), RHS, LHS.getOperand(1));
16925        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
16926                            addV, DAG.getConstant(0, addV.getValueType()), CC);
16927      }
16928  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
16929    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
16930      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
16931        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
16932                                   RHS.getValueType(), LHS, RHS.getOperand(1));
16933        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
16934                            addV, DAG.getConstant(0, addV.getValueType()), CC);
16935      }
16936  return SDValue();
16937}
16938
16939// Helper function of PerformSETCCCombine. It materializes "setb reg"
16940// as "sbb reg,reg", since it can be extended without zext and produces
16941// an all-ones bit which is more useful than 0/1 in some cases.
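// For example, "setb %al" is emitted as roughly "sbb %al, %al; and $1, %al";
// the intermediate sbb value is 0 or all-ones rather than 0/1, which later
// folds can reuse without inserting a zext.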
16942static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
16943  return DAG.getNode(ISD::AND, DL, MVT::i8,
16944                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
16945                                 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
16946                     DAG.getConstant(1, MVT::i8));
16947}
16948
16949// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
16950static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
16951                                   TargetLowering::DAGCombinerInfo &DCI,
16952                                   const X86Subtarget *Subtarget) {
16953  DebugLoc DL = N->getDebugLoc();
16954  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
16955  SDValue EFLAGS = N->getOperand(1);
16956
16957  if (CC == X86::COND_A) {
16958    // Try to convert COND_A into COND_B in an attempt to facilitate
16959    // materializing "setb reg".
16960    //
16961    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
16962    // cannot take an immediate as its first operand.
16963    //
16964    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
16965        EFLAGS.getValueType().isInteger() &&
16966        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
16967      SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
16968                                   EFLAGS.getNode()->getVTList(),
16969                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
16970      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
16971      return MaterializeSETB(DL, NewEFLAGS, DAG);
16972    }
16973  }
16974
16975  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
16976  // a zext and produces an all-ones bit which is more useful than 0/1 in some
16977  // cases.
16978  if (CC == X86::COND_B)
16979    return MaterializeSETB(DL, EFLAGS, DAG);
16980
16981  SDValue Flags;
16982
16983  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
16984  if (Flags.getNode()) {
16985    SDValue Cond = DAG.getConstant(CC, MVT::i8);
16986    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
16987  }
16988
16989  return SDValue();
16990}
16991
16992// Optimize branch condition evaluation.
16993//
16994static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
16995                                    TargetLowering::DAGCombinerInfo &DCI,
16996                                    const X86Subtarget *Subtarget) {
16997  DebugLoc DL = N->getDebugLoc();
16998  SDValue Chain = N->getOperand(0);
16999  SDValue Dest = N->getOperand(1);
17000  SDValue EFLAGS = N->getOperand(3);
17001  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
17002
17003  SDValue Flags;
17004
17005  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
17006  if (Flags.getNode()) {
17007    SDValue Cond = DAG.getConstant(CC, MVT::i8);
17008    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
17009                       Flags);
17010  }
17011
17012  return SDValue();
17013}
17014
17015static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
17016                                        const X86TargetLowering *XTLI) {
17017  SDValue Op0 = N->getOperand(0);
17018  EVT InVT = Op0->getValueType(0);
17019
17020  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
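  // The vector int->fp conversions operate on i32 elements, so sign extend the
  // narrow vector to i32 elements first and convert the widened vector.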
17021  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
17022    DebugLoc dl = N->getDebugLoc();
17023    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
17024    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
17025    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
17026  }
17027
17028  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
17029  // a 32-bit target where SSE doesn't support i64->FP operations.
17030  if (Op0.getOpcode() == ISD::LOAD) {
17031    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
17032    EVT VT = Ld->getValueType(0);
17033    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
17034        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
17035        !XTLI->getSubtarget()->is64Bit() &&
17036        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
17037      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
17038                                          Ld->getChain(), Op0, DAG);
17039      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
17040      return FILDChain;
17041    }
17042  }
17043  return SDValue();
17044}
17045
17046// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
17047static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
17048                                 X86TargetLowering::DAGCombinerInfo &DCI) {
17049  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
17050  // the result is either zero or one (depending on the input carry bit).
17051  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
17052  if (X86::isZeroNode(N->getOperand(0)) &&
17053      X86::isZeroNode(N->getOperand(1)) &&
17054      // We don't have a good way to replace an EFLAGS use, so only do this
17055      // when the EFLAGS result is dead.
17056      SDValue(N, 1).use_empty()) {
17057    DebugLoc DL = N->getDebugLoc();
17058    EVT VT = N->getValueType(0);
17059    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
17060    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
17061                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
17062                                           DAG.getConstant(X86::COND_B,MVT::i8),
17063                                           N->getOperand(2)),
17064                               DAG.getConstant(1, VT));
17065    return DCI.CombineTo(N, Res1, CarryOut);
17066  }
17067
17068  return SDValue();
17069}
17070
17071// fold (add Y, (sete  X, 0)) -> adc  0, Y
17072//      (add Y, (setne X, 0)) -> sbb -1, Y
17073//      (sub (sete  X, 0), Y) -> sbb  0, Y
17074//      (sub (setne X, 0), Y) -> adc -1, Y
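// These work because after "cmp X, 1" the carry flag is set exactly when
// X == 0 (unsigned X < 1), so e.g. (add Y, (sete X, 0)) becomes adc Y, 0,
// which computes Y plus the carry bit.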
17075static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
17076  DebugLoc DL = N->getDebugLoc();
17077
17078  // Look through ZExts.
17079  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
17080  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
17081    return SDValue();
17082
17083  SDValue SetCC = Ext.getOperand(0);
17084  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
17085    return SDValue();
17086
17087  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
17088  if (CC != X86::COND_E && CC != X86::COND_NE)
17089    return SDValue();
17090
17091  SDValue Cmp = SetCC.getOperand(1);
17092  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
17093      !X86::isZeroNode(Cmp.getOperand(1)) ||
17094      !Cmp.getOperand(0).getValueType().isInteger())
17095    return SDValue();
17096
17097  SDValue CmpOp0 = Cmp.getOperand(0);
17098  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
17099                               DAG.getConstant(1, CmpOp0.getValueType()));
17100
17101  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
17102  if (CC == X86::COND_NE)
17103    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
17104                       DL, OtherVal.getValueType(), OtherVal,
17105                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
17106  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
17107                     DL, OtherVal.getValueType(), OtherVal,
17108                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
17109}
17110
17111/// PerformAddCombine - Do target-specific dag combines on integer adds.
17112static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
17113                                 const X86Subtarget *Subtarget) {
17114  EVT VT = N->getValueType(0);
17115  SDValue Op0 = N->getOperand(0);
17116  SDValue Op1 = N->getOperand(1);
17117
17118  // Try to synthesize horizontal adds from adds of shuffles.
17119  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
17120       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
17121      isHorizontalBinOp(Op0, Op1, true))
17122    return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
17123
17124  return OptimizeConditionalInDecrement(N, DAG);
17125}
17126
17127static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
17128                                 const X86Subtarget *Subtarget) {
17129  SDValue Op0 = N->getOperand(0);
17130  SDValue Op1 = N->getOperand(1);
17131
17132  // X86 can't encode an immediate LHS of a sub. See if we can push the
17133  // negation into a preceding instruction.
17134  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
17135    // If the RHS of the sub is a XOR with one use and a constant, invert the
17136    // immediate. Then add one to the LHS of the sub so we can turn
17137    // X-Y -> X+~Y+1, saving one register.
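    // Concretely: (sub C, (xor X, K)) --> (add (xor X, ~K), C+1).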
17138    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
17139        isa<ConstantSDNode>(Op1.getOperand(1))) {
17140      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
17141      EVT VT = Op0.getValueType();
17142      SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
17143                                   Op1.getOperand(0),
17144                                   DAG.getConstant(~XorC, VT));
17145      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
17146                         DAG.getConstant(C->getAPIntValue()+1, VT));
17147    }
17148  }
17149
17150  // Try to synthesize horizontal subs from subs of shuffles.
17151  EVT VT = N->getValueType(0);
17152  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
17153       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
17154      isHorizontalBinOp(Op0, Op1, true))
17155    return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
17156
17157  return OptimizeConditionalInDecrement(N, DAG);
17158}
17159
17160/// performVZEXTCombine - Performs target-specific combines on VZEXT nodes.
17161static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
17162                                   TargetLowering::DAGCombinerInfo &DCI,
17163                                   const X86Subtarget *Subtarget) {
17164  // (vzext (bitcast (vzext x))) -> (vzext x)
17165  SDValue In = N->getOperand(0);
17166  while (In.getOpcode() == ISD::BITCAST)
17167    In = In.getOperand(0);
17168
17169  if (In.getOpcode() != X86ISD::VZEXT)
17170    return SDValue();
17171
17172  return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0),
                          In.getOperand(0));
17173}
17174
17175SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
17176                                             DAGCombinerInfo &DCI) const {
17177  SelectionDAG &DAG = DCI.DAG;
17178  switch (N->getOpcode()) {
17179  default: break;
17180  case ISD::EXTRACT_VECTOR_ELT:
17181    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
17182  case ISD::VSELECT:
17183  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
17184  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
17185  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
17186  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
17187  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
17188  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
17189  case ISD::SHL:
17190  case ISD::SRA:
17191  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
17192  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
17193  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
17194  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
17195  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
17196  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
17197  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
17198  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
17199  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
17200  case X86ISD::FXOR:
17201  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
17202  case X86ISD::FMIN:
17203  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
17204  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
17205  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
17206  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
17207  case ISD::ANY_EXTEND:
17208  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
17209  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
17210  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
17211  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
17212  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
17213  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
17214  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
17215  case X86ISD::SHUFP:       // Handle all target specific shuffles
17216  case X86ISD::PALIGN:
17217  case X86ISD::UNPCKH:
17218  case X86ISD::UNPCKL:
17219  case X86ISD::MOVHLPS:
17220  case X86ISD::MOVLHPS:
17221  case X86ISD::PSHUFD:
17222  case X86ISD::PSHUFHW:
17223  case X86ISD::PSHUFLW:
17224  case X86ISD::MOVSS:
17225  case X86ISD::MOVSD:
17226  case X86ISD::VPERMILP:
17227  case X86ISD::VPERM2X128:
17228  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
17229  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
17230  }
17231
17232  return SDValue();
17233}
17234
17235/// isTypeDesirableForOp - Return true if the target has native support for
17236/// the specified value type and it is 'desirable' to use the type for the
17237/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
17238/// instruction encodings are longer and some i16 instructions are slow.
17239bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
17240  if (!isTypeLegal(VT))
17241    return false;
17242  if (VT != MVT::i16)
17243    return true;
17244
17245  switch (Opc) {
17246  default:
17247    return true;
17248  case ISD::LOAD:
17249  case ISD::SIGN_EXTEND:
17250  case ISD::ZERO_EXTEND:
17251  case ISD::ANY_EXTEND:
17252  case ISD::SHL:
17253  case ISD::SRL:
17254  case ISD::SUB:
17255  case ISD::ADD:
17256  case ISD::MUL:
17257  case ISD::AND:
17258  case ISD::OR:
17259  case ISD::XOR:
17260    return false;
17261  }
17262}
17263
17264/// IsDesirableToPromoteOp - This method queries the target whether it is
17265/// beneficial for the dag combiner to promote the specified node. If true,
17266/// it should return the desired promotion type by reference.
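/// For example, an i16 add is promoted to i32 to avoid the 16-bit
/// operand-size prefix, unless (per the opcode checks below) the promotion
/// would get in the way of load folding.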
17267bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
17268  EVT VT = Op.getValueType();
17269  if (VT != MVT::i16)
17270    return false;
17271
17272  bool Promote = false;
17273  bool Commute = false;
17274  switch (Op.getOpcode()) {
17275  default: break;
17276  case ISD::LOAD: {
17277    LoadSDNode *LD = cast<LoadSDNode>(Op);
17278    // If the non-extending load has a single use and it's not live out, then it
17279    // might be folded.
17280    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
17281                                                     Op.hasOneUse()*/) {
17282      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17283             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
17284        // The only case where we'd want to promote LOAD (rather than having
17285        // it promoted as an operand) is when its only use is a liveout copy.
17286        if (UI->getOpcode() != ISD::CopyToReg)
17287          return false;
17288      }
17289    }
17290    Promote = true;
17291    break;
17292  }
17293  case ISD::SIGN_EXTEND:
17294  case ISD::ZERO_EXTEND:
17295  case ISD::ANY_EXTEND:
17296    Promote = true;
17297    break;
17298  case ISD::SHL:
17299  case ISD::SRL: {
17300    SDValue N0 = Op.getOperand(0);
17301    // Look out for (store (shl (load), x)).
17302    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
17303      return false;
17304    Promote = true;
17305    break;
17306  }
17307  case ISD::ADD:
17308  case ISD::MUL:
17309  case ISD::AND:
17310  case ISD::OR:
17311  case ISD::XOR:
17312    Commute = true;
17313    // fallthrough
17314  case ISD::SUB: {
17315    SDValue N0 = Op.getOperand(0);
17316    SDValue N1 = Op.getOperand(1);
17317    if (!Commute && MayFoldLoad(N1))
17318      return false;
17319    // Avoid disabling potential load folding opportunities.
17320    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
17321      return false;
17322    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
17323      return false;
17324    Promote = true;
17325  }
17326  }
17327
17328  PVT = MVT::i32;
17329  return Promote;
17330}
17331
17332//===----------------------------------------------------------------------===//
17333//                           X86 Inline Assembly Support
17334//===----------------------------------------------------------------------===//
17335
17336namespace {
17337  // Helper to match an asm string against a list of whitespace-separated pieces.
17338  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
17339    s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
17340
17341    for (unsigned i = 0, e = args.size(); i != e; ++i) {
17342      StringRef piece(*args[i]);
17343      if (!s.startswith(piece)) // Check if the piece matches.
17344        return false;
17345
17346      s = s.substr(piece.size());
17347      StringRef::size_type pos = s.find_first_not_of(" \t");
17348      if (pos == 0) // The piece only matched a prefix of a longer token.
17349        return false;
17350
17351      s = s.substr(pos);
17352    }
17353
17354    return s.empty();
17355  }
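  // The variadic wrapper below lets callers write, e.g.,
  // matchAsm(Piece, "bswap", "$0"), which accepts "  bswap   $0" but rejects
  // "bswapl $0" (the first piece would only match a prefix of "bswapl").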
17356  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
17357}
17358
17359bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
17360  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
17361
17362  std::string AsmStr = IA->getAsmString();
17363
17364  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
17365  if (!Ty || Ty->getBitWidth() % 16 != 0)
17366    return false;
17367
17368  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
17369  SmallVector<StringRef, 4> AsmPieces;
17370  SplitString(AsmStr, AsmPieces, ";\n");
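  // For example, "bswap %eax\n\tbswap %edx\n\txchgl %eax, %edx" splits into
  // the three pieces handled by the 3-piece case below.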
17371
17372  switch (AsmPieces.size()) {
17373  default: return false;
17374  case 1:
17375    // FIXME: this should verify that we are targeting a 486 or better.  If not,
17376    // we will turn this bswap into something that will be lowered to logical
17377    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
17378    // lower so don't worry about this.
17379    // bswap $0
17380    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
17381        matchAsm(AsmPieces[0], "bswapl", "$0") ||
17382        matchAsm(AsmPieces[0], "bswapq", "$0") ||
17383        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
17384        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
17385        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
17386      // No need to check constraints, nothing other than the equivalent of
17387      // "=r,0" would be valid here.
17388      return IntrinsicLowering::LowerToByteSwap(CI);
17389    }
17390
17391    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
17392    if (CI->getType()->isIntegerTy(16) &&
17393        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
17394        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
17395         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
17396      AsmPieces.clear();
17397      const std::string &ConstraintsStr = IA->getConstraintString();
17398      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
17399      std::sort(AsmPieces.begin(), AsmPieces.end());
17400      if (AsmPieces.size() == 4 &&
17401          AsmPieces[0] == "~{cc}" &&
17402          AsmPieces[1] == "~{dirflag}" &&
17403          AsmPieces[2] == "~{flags}" &&
17404          AsmPieces[3] == "~{fpsr}")
17405      return IntrinsicLowering::LowerToByteSwap(CI);
17406    }
17407    break;
17408  case 3:
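    // rorw $$8, ${0:w} / rorl $$16, $0 / rorw $$8, ${0:w}  -->  llvm.bswap.i32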
17409    if (CI->getType()->isIntegerTy(32) &&
17410        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
17411        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
17412        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
17413        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
17414      AsmPieces.clear();
17415      const std::string &ConstraintsStr = IA->getConstraintString();
17416      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
17417      std::sort(AsmPieces.begin(), AsmPieces.end());
17418      if (AsmPieces.size() == 4 &&
17419          AsmPieces[0] == "~{cc}" &&
17420          AsmPieces[1] == "~{dirflag}" &&
17421          AsmPieces[2] == "~{flags}" &&
17422          AsmPieces[3] == "~{fpsr}")
17423        return IntrinsicLowering::LowerToByteSwap(CI);
17424    }
17425
17426    if (CI->getType()->isIntegerTy(64)) {
17427      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
17428      if (Constraints.size() >= 2 &&
17429          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
17430          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
17431        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
17432        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
17433            matchAsm(AsmPieces[1], "bswap", "%edx") &&
17434            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
17435          return IntrinsicLowering::LowerToByteSwap(CI);
17436      }
17437    }
17438    break;
17439  }
17440  return false;
17441}
17442
17443/// getConstraintType - Given a constraint letter, return the type of
17444/// constraint it is for this target.
17445X86TargetLowering::ConstraintType
17446X86TargetLowering::getConstraintType(const std::string &Constraint) const {
17447  if (Constraint.size() == 1) {
17448    switch (Constraint[0]) {
17449    case 'R':
17450    case 'q':
17451    case 'Q':
17452    case 'f':
17453    case 't':
17454    case 'u':
17455    case 'y':
17456    case 'x':
17457    case 'Y':
17458    case 'l':
17459      return C_RegisterClass;
17460    case 'a':
17461    case 'b':
17462    case 'c':
17463    case 'd':
17464    case 'S':
17465    case 'D':
17466    case 'A':
17467      return C_Register;
17468    case 'I':
17469    case 'J':
17470    case 'K':
17471    case 'L':
17472    case 'M':
17473    case 'N':
17474    case 'G':
17475    case 'C':
17476    case 'e':
17477    case 'Z':
17478      return C_Other;
17479    default:
17480      break;
17481    }
17482  }
17483  return TargetLowering::getConstraintType(Constraint);
17484}
17485
17486/// Examine constraint type and operand type and determine a weight value.
17487/// This object must already have been set up with the operand type
17488/// and the current alternative constraint selected.
17489TargetLowering::ConstraintWeight
17490  X86TargetLowering::getSingleConstraintMatchWeight(
17491    AsmOperandInfo &info, const char *constraint) const {
17492  ConstraintWeight weight = CW_Invalid;
17493  Value *CallOperandVal = info.CallOperandVal;
17494  // If we don't have a value, we can't do a match, but allow it at the
17495  // lowest weight.
17496  if (CallOperandVal == NULL)
17497    return CW_Default;
17498  Type *type = CallOperandVal->getType();
17499  // Look at the constraint type.
17500  switch (*constraint) {
17501  default:
17502    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
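    // FALL THROUGH: the default weight may still be overridden by the
    // register cases below.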
17503  case 'R':
17504  case 'q':
17505  case 'Q':
17506  case 'a':
17507  case 'b':
17508  case 'c':
17509  case 'd':
17510  case 'S':
17511  case 'D':
17512  case 'A':
17513    if (CallOperandVal->getType()->isIntegerTy())
17514      weight = CW_SpecificReg;
17515    break;
17516  case 'f':
17517  case 't':
17518  case 'u':
17519    if (type->isFloatingPointTy())
17520      weight = CW_SpecificReg;
17521    break;
17522  case 'y':
17523    if (type->isX86_MMXTy() && Subtarget->hasMMX())
17524      weight = CW_SpecificReg;
17525    break;
17526  case 'x':
17527  case 'Y':
17528    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
17529        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
17530      weight = CW_Register;
17531    break;
17532  case 'I':
17533    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
17534      if (C->getZExtValue() <= 31)
17535        weight = CW_Constant;
17536    }
17537    break;
17538  case 'J':
17539    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17540      if (C->getZExtValue() <= 63)
17541        weight = CW_Constant;
17542    }
17543    break;
17544  case 'K':
17545    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17546      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
17547        weight = CW_Constant;
17548    }
17549    break;
17550  case 'L':
17551    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17552      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
17553        weight = CW_Constant;
17554    }
17555    break;
17556  case 'M':
17557    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17558      if (C->getZExtValue() <= 3)
17559        weight = CW_Constant;
17560    }
17561    break;
17562  case 'N':
17563    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17564      if (C->getZExtValue() <= 0xff)
17565        weight = CW_Constant;
17566    }
17567    break;
17568  case 'G':
17569  case 'C':
17570    if (dyn_cast<ConstantFP>(CallOperandVal)) {
17571      weight = CW_Constant;
17572    }
17573    break;
17574  case 'e':
17575    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17576      if ((C->getSExtValue() >= -0x80000000LL) &&
17577          (C->getSExtValue() <= 0x7fffffffLL))
17578        weight = CW_Constant;
17579    }
17580    break;
17581  case 'Z':
17582    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
17583      if (C->getZExtValue() <= 0xffffffff)
17584        weight = CW_Constant;
17585    }
17586    break;
17587  }
17588  return weight;
17589}
17590
17591/// LowerXConstraint - try to replace an X constraint, which matches anything,
17592/// with another that has more specific requirements based on the type of the
17593/// corresponding operand.
17594const char *X86TargetLowering::
17595LowerXConstraint(EVT ConstraintVT) const {
17596  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
17597  // 'f' like normal targets.
17598  if (ConstraintVT.isFloatingPoint()) {
17599    if (Subtarget->hasSSE2())
17600      return "Y";
17601    if (Subtarget->hasSSE1())
17602      return "x";
17603  }
17604
17605  return TargetLowering::LowerXConstraint(ConstraintVT);
17606}
17607
17608/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
17609/// vector.  If it is invalid, don't add anything to Ops.
17610void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17611                                                     std::string &Constraint,
17612                                                     std::vector<SDValue>&Ops,
17613                                                     SelectionDAG &DAG) const {
17614  SDValue Result(0, 0);
17615
17616  // Only support length 1 constraints for now.
17617  if (Constraint.length() > 1) return;
17618
17619  char ConstraintLetter = Constraint[0];
17620  switch (ConstraintLetter) {
17621  default: break;
17622  case 'I':
17623    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17624      if (C->getZExtValue() <= 31) {
17625        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
17626        break;
17627      }
17628    }
17629    return;
17630  case 'J':
17631    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17632      if (C->getZExtValue() <= 63) {
17633        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
17634        break;
17635      }
17636    }
17637    return;
17638  case 'K':
17639    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17640      if (isInt<8>(C->getSExtValue())) {
17641        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
17642        break;
17643      }
17644    }
17645    return;
17646  case 'N':
17647    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17648      if (C->getZExtValue() <= 255) {
17649        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
17650        break;
17651      }
17652    }
17653    return;
17654  case 'e': {
17655    // 32-bit signed value
17656    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17657      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
17658                                           C->getSExtValue())) {
17659        // Widen to 64 bits here to get it sign extended.
17660        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
17661        break;
17662      }
17663    // FIXME gcc accepts some relocatable values here too, but only in certain
17664    // memory models; it's complicated.
17665    }
17666    return;
17667  }
17668  case 'Z': {
17669    // 32-bit unsigned value
17670    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17671      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
17672                                           C->getZExtValue())) {
17673        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
17674        break;
17675      }
17676    }
17677    // FIXME gcc accepts some relocatable values here too, but only in certain
17678    // memory models; it's complicated.
17679    return;
17680  }
17681  case 'i': {
17682    // Literal immediates are always ok.
17683    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
17684      // Widen to 64 bits here to get it sign extended.
17685      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
17686      break;
17687    }
17688
17689    // In any sort of PIC mode addresses need to be computed at runtime by
17690    // adding in a register or some sort of table lookup.  These can't
17691    // be used as immediates.
17692    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
17693      return;
17694
17695    // If we are in non-pic codegen mode, we allow the address of a global (with
17696    // an optional displacement) to be used with 'i'.
17697    GlobalAddressSDNode *GA = 0;
17698    int64_t Offset = 0;
17699
17700    // Match either (GA), (GA+C), (GA+C1+C2), etc.
17701    while (1) {
17702      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
17703        Offset += GA->getOffset();
17704        break;
17705      } else if (Op.getOpcode() == ISD::ADD) {
17706        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
17707          Offset += C->getZExtValue();
17708          Op = Op.getOperand(0);
17709          continue;
17710        }
17711      } else if (Op.getOpcode() == ISD::SUB) {
17712        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
17713          Offset += -C->getZExtValue();
17714          Op = Op.getOperand(0);
17715          continue;
17716        }
17717      }
17718
17719      // Otherwise, this isn't something we can handle, reject it.
17720      return;
17721    }
17722
17723    const GlobalValue *GV = GA->getGlobal();
17724    // If we require an extra load to get this address, as in PIC mode, we
17725    // can't accept it.
17726    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
17727                                                        getTargetMachine())))
17728      return;
17729
17730    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
17731                                        GA->getValueType(0), Offset);
17732    break;
17733  }
17734  }
17735
17736  if (Result.getNode()) {
17737    Ops.push_back(Result);
17738    return;
17739  }
17740  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17741}
17742
17743std::pair<unsigned, const TargetRegisterClass*>
17744X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
17745                                                EVT VT) const {
17746  // First, see if this is a constraint that directly corresponds to an LLVM
17747  // register class.
17748  if (Constraint.size() == 1) {
17749    // GCC Constraint Letters
17750    switch (Constraint[0]) {
17751    default: break;
17752      // TODO: Slight differences here in allocation order and leaving
17753      // RIP in the class. Do they matter any more here than they do
17754      // in the normal allocation?
17755    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
17756      if (Subtarget->is64Bit()) {
17757        if (VT == MVT::i32 || VT == MVT::f32)
17758          return std::make_pair(0U, &X86::GR32RegClass);
17759        if (VT == MVT::i16)
17760          return std::make_pair(0U, &X86::GR16RegClass);
17761        if (VT == MVT::i8 || VT == MVT::i1)
17762          return std::make_pair(0U, &X86::GR8RegClass);
17763        if (VT == MVT::i64 || VT == MVT::f64)
17764          return std::make_pair(0U, &X86::GR64RegClass);
17765        break;
17766      }
17767      // 32-bit fallthrough
17768    case 'Q':   // Q_REGS
17769      if (VT == MVT::i32 || VT == MVT::f32)
17770        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
17771      if (VT == MVT::i16)
17772        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
17773      if (VT == MVT::i8 || VT == MVT::i1)
17774        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
17775      if (VT == MVT::i64)
17776        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
17777      break;
17778    case 'r':   // GENERAL_REGS
17779    case 'l':   // INDEX_REGS
17780      if (VT == MVT::i8 || VT == MVT::i1)
17781        return std::make_pair(0U, &X86::GR8RegClass);
17782      if (VT == MVT::i16)
17783        return std::make_pair(0U, &X86::GR16RegClass);
17784      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
17785        return std::make_pair(0U, &X86::GR32RegClass);
17786      return std::make_pair(0U, &X86::GR64RegClass);
17787    case 'R':   // LEGACY_REGS
17788      if (VT == MVT::i8 || VT == MVT::i1)
17789        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
17790      if (VT == MVT::i16)
17791        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
17792      if (VT == MVT::i32 || !Subtarget->is64Bit())
17793        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
17794      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
17795    case 'f':  // FP Stack registers.
17796      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
17797      // value to the correct fpstack register class.
17798      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
17799        return std::make_pair(0U, &X86::RFP32RegClass);
17800      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
17801        return std::make_pair(0U, &X86::RFP64RegClass);
17802      return std::make_pair(0U, &X86::RFP80RegClass);
17803    case 'y':   // MMX_REGS if MMX allowed.
17804      if (!Subtarget->hasMMX()) break;
17805      return std::make_pair(0U, &X86::VR64RegClass);
17806    case 'Y':   // SSE_REGS if SSE2 allowed
17807      if (!Subtarget->hasSSE2()) break;
17808      // FALL THROUGH.
17809    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
17810      if (!Subtarget->hasSSE1()) break;
17811
17812      switch (VT.getSimpleVT().SimpleTy) {
17813      default: break;
17814      // Scalar SSE types.
17815      case MVT::f32:
17816      case MVT::i32:
17817        return std::make_pair(0U, &X86::FR32RegClass);
17818      case MVT::f64:
17819      case MVT::i64:
17820        return std::make_pair(0U, &X86::FR64RegClass);
17821      // Vector types.
17822      case MVT::v16i8:
17823      case MVT::v8i16:
17824      case MVT::v4i32:
17825      case MVT::v2i64:
17826      case MVT::v4f32:
17827      case MVT::v2f64:
17828        return std::make_pair(0U, &X86::VR128RegClass);
17829      // AVX types.
17830      case MVT::v32i8:
17831      case MVT::v16i16:
17832      case MVT::v8i32:
17833      case MVT::v4i64:
17834      case MVT::v8f32:
17835      case MVT::v4f64:
17836        return std::make_pair(0U, &X86::VR256RegClass);
17837      }
17838      break;
17839    }
17840  }
17841
17842  // Use the default implementation in TargetLowering to convert the register
17843  // constraint into a member of a register class.
17844  std::pair<unsigned, const TargetRegisterClass*> Res;
17845  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
17846
17847  // Not found as a standard register?
17848  if (Res.second == 0) {
17849    // Map {st(0)} .. {st(7)} to the corresponding ST0 .. ST7 register.
17850    if (Constraint.size() == 7 && Constraint[0] == '{' &&
17851        tolower(Constraint[1]) == 's' &&
17852        tolower(Constraint[2]) == 't' &&
17853        Constraint[3] == '(' &&
17854        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
17855        Constraint[5] == ')' &&
17856        Constraint[6] == '}') {
17857
17858      Res.first = X86::ST0+Constraint[4]-'0';
17859      Res.second = &X86::RFP80RegClass;
17860      return Res;
17861    }
17862
17863    // GCC allows "st(0)" to be called just plain "st".
17864    if (StringRef("{st}").equals_lower(Constraint)) {
17865      Res.first = X86::ST0;
17866      Res.second = &X86::RFP80RegClass;
17867      return Res;
17868    }
17869
17870    // flags -> EFLAGS
17871    if (StringRef("{flags}").equals_lower(Constraint)) {
17872      Res.first = X86::EFLAGS;
17873      Res.second = &X86::CCRRegClass;
17874      return Res;
17875    }
17876
17877    // 'A' means EAX + EDX.
17878    if (Constraint == "A") {
17879      Res.first = X86::EAX;
17880      Res.second = &X86::GR32_ADRegClass;
17881      return Res;
17882    }
17883    return Res;
17884  }
17885
17886  // Otherwise, check to see if this is a register class of the wrong value
17887  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it to
17888  // turn into {ax},{dx}.
17889  if (Res.second->hasType(VT))
17890    return Res;   // Correct type already, nothing to do.
17891
17892  // All of the single-register GCC register classes map their values onto
17893  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
17894  // really want an 8-bit or 32-bit register, map to the appropriate register
17895  // class and return the appropriate register.
17896  if (Res.second == &X86::GR16RegClass) {
17897    if (VT == MVT::i8) {
17898      unsigned DestReg = 0;
17899      switch (Res.first) {
17900      default: break;
17901      case X86::AX: DestReg = X86::AL; break;
17902      case X86::DX: DestReg = X86::DL; break;
17903      case X86::CX: DestReg = X86::CL; break;
17904      case X86::BX: DestReg = X86::BL; break;
17905      }
17906      if (DestReg) {
17907        Res.first = DestReg;
17908        Res.second = &X86::GR8RegClass;
17909      }
17910    } else if (VT == MVT::i32) {
17911      unsigned DestReg = 0;
17912      switch (Res.first) {
17913      default: break;
17914      case X86::AX: DestReg = X86::EAX; break;
17915      case X86::DX: DestReg = X86::EDX; break;
17916      case X86::CX: DestReg = X86::ECX; break;
17917      case X86::BX: DestReg = X86::EBX; break;
17918      case X86::SI: DestReg = X86::ESI; break;
17919      case X86::DI: DestReg = X86::EDI; break;
17920      case X86::BP: DestReg = X86::EBP; break;
17921      case X86::SP: DestReg = X86::ESP; break;
17922      }
17923      if (DestReg) {
17924        Res.first = DestReg;
17925        Res.second = &X86::GR32RegClass;
17926      }
17927    } else if (VT == MVT::i64) {
17928      unsigned DestReg = 0;
17929      switch (Res.first) {
17930      default: break;
17931      case X86::AX: DestReg = X86::RAX; break;
17932      case X86::DX: DestReg = X86::RDX; break;
17933      case X86::CX: DestReg = X86::RCX; break;
17934      case X86::BX: DestReg = X86::RBX; break;
17935      case X86::SI: DestReg = X86::RSI; break;
17936      case X86::DI: DestReg = X86::RDI; break;
17937      case X86::BP: DestReg = X86::RBP; break;
17938      case X86::SP: DestReg = X86::RSP; break;
17939      }
17940      if (DestReg) {
17941        Res.first = DestReg;
17942        Res.second = &X86::GR64RegClass;
17943      }
17944    }
17945  } else if (Res.second == &X86::FR32RegClass ||
17946             Res.second == &X86::FR64RegClass ||
17947             Res.second == &X86::VR128RegClass) {
17948    // Handle references to XMM physical registers that got mapped into the
17949    // wrong class.  This can happen with constraints like {xmm0} where the
17950    // target independent register mapper will just pick the first match it can
17951    // find, ignoring the required type.
17952
17953    if (VT == MVT::f32 || VT == MVT::i32)
17954      Res.second = &X86::FR32RegClass;
17955    else if (VT == MVT::f64 || VT == MVT::i64)
17956      Res.second = &X86::FR64RegClass;
17957    else if (X86::VR128RegClass.hasType(VT))
17958      Res.second = &X86::VR128RegClass;
17959    else if (X86::VR256RegClass.hasType(VT))
17960      Res.second = &X86::VR256RegClass;
17961  }
17962
17963  return Res;
17964}
17965
17966//===----------------------------------------------------------------------===//
17967//
17968// X86 cost model.
17969//
17970//===----------------------------------------------------------------------===//
17971
17972struct X86CostTblEntry {
17973  int ISD;
17974  MVT Type;
17975  unsigned Cost;
17976};
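// Cost table entries are keyed on (ISD opcode, legalized MVT). For example,
// { ISD::MUL, MVT::v8i32, 4 } in AVX1CostTable below models a legalized
// v8i32 multiply as costing 4.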
17977
17978static int
17979FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD, MVT Ty) {
17980  for (unsigned int i = 0; i < len; ++i)
17981    if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty)
17982      return i;
17983
17984  // Could not find an entry.
17985  return -1;
17986}
17987
17988struct X86TypeConversionCostTblEntry {
17989  int ISD;
17990  MVT Dst;
17991  MVT Src;
17992  unsigned Cost;
17993};
17994
17995static int
17996FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len,
17997                   int ISD, MVT Dst, MVT Src) {
17998  for (unsigned int i = 0; i < len; ++i)
17999    if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst)
18000      return i;
18001
18002  // Could not find an entry.
18003  return -1;
18004}
18005
18006ScalarTargetTransformInfo::PopcntHwSupport
18007X86ScalarTargetTransformImpl::getPopcntHwSupport(unsigned TyWidth) const {
18008  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
18009  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
18010
18011  // TODO: Currently the __builtin_popcount() implementation using SSE3
18012  //   instructions is inefficient. Once the problem is fixed, we should
18013  //   call ST.hasSSE3() instead of ST.hasSSE41().
18014  return ST.hasSSE41() ? Fast : None;
18015}
18016
18017unsigned
18018X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
18019                                                     Type *Ty) const {
18020  // Legalize the type.
18021  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Ty);
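  // LT.first is (roughly) the number of legal-type operations the original
  // type splits into and LT.second is the legalized type; the table costs
  // below are per legalized op, hence the LT.first multiplier.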
18022
18023  int ISD = InstructionOpcodeToISD(Opcode);
18024  assert(ISD && "Invalid opcode");
18025
18026  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
18027
18028  static const X86CostTblEntry AVX1CostTable[] = {
18029    // We don't have to scalarize unsupported ops. We can issue two half-sized
18030    // operations and we only need to extract the upper YMM half.
18031    // Two ops + 1 extract + 1 insert = 4.
18032    { ISD::MUL,     MVT::v8i32,    4 },
18033    { ISD::SUB,     MVT::v8i32,    4 },
18034    { ISD::ADD,     MVT::v8i32,    4 },
18035    { ISD::MUL,     MVT::v4i64,    4 },
18036    { ISD::SUB,     MVT::v4i64,    4 },
18037    { ISD::ADD,     MVT::v4i64,    4 },
18038    };
18039
18040  // Look for AVX1 lowering tricks.
18041  if (ST.hasAVX()) {
18042    int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD,
18043                          LT.second);
18044    if (Idx != -1)
18045      return LT.first * AVX1CostTable[Idx].Cost;
18046  }
18047  // Fallback to the default implementation.
18048  return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
18049}
18050
18051unsigned
18052X86VectorTargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
18053                                              unsigned Alignment,
18054                                              unsigned AddressSpace) const {
18055  // Legalize the type.
18056  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Src);
18057  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
18058         "Invalid Opcode");
18059
18060  const X86Subtarget &ST =
18061  TLI->getTargetMachine().getSubtarget<X86Subtarget>();
18062
18063  // Each load/store unit costs 1.
18064  unsigned Cost = LT.first * 1;
18065
18066  // On Sandy Bridge, 256-bit loads/stores are double pumped (issued as two
18067  // 128-bit operations), but not on Haswell, so charge them double.
18068  if (LT.second.getSizeInBits() > 128 && !ST.hasAVX2())
18069    Cost*=2;
18070
18071  return Cost;
18072}
18073
18074unsigned
18075X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
18076                                                 unsigned Index) const {
18077  assert(Val->isVectorTy() && "This must be a vector type");
18078
18079  if (Index != -1U) {
18080    // Legalize the type.
18081    std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Val);
18082
18083    // This type is legalized to a scalar type.
18084    if (!LT.second.isVector())
18085      return 0;
18086
18087    // The type may be split. Normalize the index to the new type.
18088    unsigned Width = LT.second.getVectorNumElements();
18089    Index = Index % Width;
18090
18091    // Floating point scalars are already located in index #0.
18092    if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
18093      return 0;
18094  }
18095
18096  return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
18097}
18098
18099unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode,
18100                                                          Type *ValTy,
18101                                                          Type *CondTy) const {
18102  // Legalize the type.
18103  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(ValTy);
18104
18105  MVT MTy = LT.second;
18106
18107  int ISD = InstructionOpcodeToISD(Opcode);
18108  assert(ISD && "Invalid opcode");
18109
18110  const X86Subtarget &ST =
18111  TLI->getTargetMachine().getSubtarget<X86Subtarget>();
18112
18113  static const X86CostTblEntry SSE42CostTbl[] = {
18114    { ISD::SETCC,   MVT::v2f64,   1 },
18115    { ISD::SETCC,   MVT::v4f32,   1 },
18116    { ISD::SETCC,   MVT::v2i64,   1 },
18117    { ISD::SETCC,   MVT::v4i32,   1 },
18118    { ISD::SETCC,   MVT::v8i16,   1 },
18119    { ISD::SETCC,   MVT::v16i8,   1 },
18120  };
18121
18122  static const X86CostTblEntry AVX1CostTbl[] = {
18123    { ISD::SETCC,   MVT::v4f64,   1 },
18124    { ISD::SETCC,   MVT::v8f32,   1 },
18125    // AVX1 does not support 8-wide integer compare.
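    // (the cost of 4 appears to mirror the AVX1 arithmetic entries above:
    //  two 128-bit compares plus an extract and an insert)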
18126    { ISD::SETCC,   MVT::v4i64,   4 },
18127    { ISD::SETCC,   MVT::v8i32,   4 },
18128    { ISD::SETCC,   MVT::v16i16,  4 },
18129    { ISD::SETCC,   MVT::v32i8,   4 },
18130  };
18131
18132  static const X86CostTblEntry AVX2CostTbl[] = {
18133    { ISD::SETCC,   MVT::v4i64,   1 },
18134    { ISD::SETCC,   MVT::v8i32,   1 },
18135    { ISD::SETCC,   MVT::v16i16,  1 },
18136    { ISD::SETCC,   MVT::v32i8,   1 },
18137  };
18138
18139  if (ST.hasAVX2()) {
18140    int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
18141    if (Idx != -1)
18142      return LT.first * AVX2CostTbl[Idx].Cost;
18143  }
18144
18145  if (ST.hasAVX()) {
18146    int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
18147    if (Idx != -1)
18148      return LT.first * AVX1CostTbl[Idx].Cost;
18149  }
18150
18151  if (ST.hasSSE42()) {
18152    int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
18153    if (Idx != -1)
18154      return LT.first * SSE42CostTbl[Idx].Cost;
18155  }
18156
18157  return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy);
18158}
18159
18160unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode,
18161                                                        Type *Dst,
18162                                                        Type *Src) const {
18163  int ISD = InstructionOpcodeToISD(Opcode);
18164  assert(ISD && "Invalid opcode");
18165
18166  EVT SrcTy = TLI->getValueType(Src);
18167  EVT DstTy = TLI->getValueType(Dst);
18168
18169  if (!SrcTy.isSimple() || !DstTy.isSimple())
18170    return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
18171
18172  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
18173
18174  static const X86TypeConversionCostTblEntry AVXConversionTbl[] = {
18175    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
18176    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
18177    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
18178    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
18179    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 1 },
18180    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32, 1 },
18181    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  1 },
18182    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  1 },
18183    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  1 },
18184    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  1 },
18185    { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 1 },
18186    { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
18187    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1,  6 },
18188    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1,  9 },
18189    { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64, 3 },
18190  };
18191
18192  if (ST.hasAVX()) {
18193    int Idx = FindInConvertTable(AVXConversionTbl,
18194                                 array_lengthof(AVXConversionTbl),
18195                                 ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
18196    if (Idx != -1)
18197      return AVXConversionTbl[Idx].Cost;
18198  }
18199
18200  return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
18201}
18202
18203