X86ISelLowering.cpp revision 6643d9c1800df550c071baad9ad770a59d4dd903
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "x86-isel"
16#include "X86ISelLowering.h"
17#include "X86.h"
18#include "X86InstrBuilder.h"
19#include "X86TargetMachine.h"
20#include "X86TargetObjectFile.h"
21#include "Utils/X86ShuffleDecode.h"
22#include "llvm/CallingConv.h"
23#include "llvm/Constants.h"
24#include "llvm/DerivedTypes.h"
25#include "llvm/GlobalAlias.h"
26#include "llvm/GlobalVariable.h"
27#include "llvm/Function.h"
28#include "llvm/Instructions.h"
29#include "llvm/Intrinsics.h"
30#include "llvm/LLVMContext.h"
31#include "llvm/CodeGen/IntrinsicLowering.h"
32#include "llvm/CodeGen/MachineFrameInfo.h"
33#include "llvm/CodeGen/MachineFunction.h"
34#include "llvm/CodeGen/MachineInstrBuilder.h"
35#include "llvm/CodeGen/MachineJumpTableInfo.h"
36#include "llvm/CodeGen/MachineModuleInfo.h"
37#include "llvm/CodeGen/MachineRegisterInfo.h"
38#include "llvm/MC/MCAsmInfo.h"
39#include "llvm/MC/MCContext.h"
40#include "llvm/MC/MCExpr.h"
41#include "llvm/MC/MCSymbol.h"
42#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
44#include "llvm/ADT/StringExtras.h"
45#include "llvm/ADT/VariadicFunction.h"
46#include "llvm/Support/CallSite.h"
47#include "llvm/Support/Debug.h"
48#include "llvm/Support/ErrorHandling.h"
49#include "llvm/Support/MathExtras.h"
50#include "llvm/Target/TargetOptions.h"
51#include <bitset>
52using namespace llvm;
53
54STATISTIC(NumTailCalls, "Number of tail calls");
55
56// Forward declarations.
57static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
58                       SDValue V2);
59
60/// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
61/// sets things up to match to an AVX VEXTRACTF128 instruction or a
62/// simple subregister reference.  Idx is an index in the 128 bits we
63/// want.  It need not be aligned to a 128-bit boundary.  That makes
64/// lowering EXTRACT_VECTOR_ELT operations easier.
65static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
66                                   SelectionDAG &DAG, DebugLoc dl) {
67  EVT VT = Vec.getValueType();
68  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
69  EVT ElVT = VT.getVectorElementType();
70  unsigned Factor = VT.getSizeInBits()/128;
71  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
72                                  VT.getVectorNumElements()/Factor);
73
74  // Extract from UNDEF is UNDEF.
75  if (Vec.getOpcode() == ISD::UNDEF)
76    return DAG.getUNDEF(ResultVT);
77
78  // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
79  // we can match to VEXTRACTF128.
80  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
81
82  // This is the index of the first element of the 128-bit chunk
83  // we want.
84  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
85                               * ElemsPerChunk);
86
87  SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
88  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
89                               VecIdx);
90
91  return Result;
92}
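// Illustrative example (assuming a v8i32 input): ElVT = i32, so
// ElemsPerChunk = 128 / 32 = 4.  With IdxVal = 5, NormalizedIdxVal =
// ((5 * 32) / 128) * 4 = 4, i.e. the EXTRACT_SUBVECTOR starts at element 4,
// the upper 128-bit half, which is what VEXTRACTF128 with immediate 1
// extracts.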
93
94/// Generate a DAG to put 128 bits into a vector > 128 bits.  This
95/// sets things up to match to an AVX VINSERTF128 instruction or a
96/// simple superregister reference.  Idx is an index in the 128 bits
97/// we want.  It need not be aligned to a 128-bit boundary.  That makes
98/// lowering INSERT_VECTOR_ELT operations easier.
99static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
100                                  unsigned IdxVal, SelectionDAG &DAG,
101                                  DebugLoc dl) {
102  EVT VT = Vec.getValueType();
103  assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
104
105  EVT ElVT = VT.getVectorElementType();
106  EVT ResultVT = Result.getValueType();
107
108  // Insert the relevant 128 bits.
109  unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
110
111  // This is the index of the first element of the 128-bit chunk
112  // we want.
113  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
114                               * ElemsPerChunk);
115
116  SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
117  Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
118                       VecIdx);
119  return Result;
120}
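// Illustrative example (assuming Result is v8i32 and Vec is v4i32):
// ElemsPerChunk = 128 / 32 = 4, and for IdxVal = 4, NormalizedIdxVal =
// ((4 * 32) / 128) * 4 = 4, so Vec becomes the upper 128-bit half of Result,
// matching VINSERTF128 with immediate 1.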
121
122/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
123/// instructions. This is used because creating CONCAT_VECTORS nodes of
124/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
125/// large BUILD_VECTORS.
126static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
127                                   unsigned NumElems, SelectionDAG &DAG,
128                                   DebugLoc dl) {
129  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
130  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
131}
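// Illustrative usage (Lo and Hi are hypothetical 128-bit values): to build a
// v8f32 from two v4f32 halves one could write
//   SDValue V = Concat128BitVectors(Lo, Hi, MVT::v8f32, 8, DAG, dl);
// which places Lo in elements [0,3] and Hi in elements [4,7] via two
// VINSERTF128-style inserts.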
132
133static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
134  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
135  bool is64Bit = Subtarget->is64Bit();
136
137  if (Subtarget->isTargetEnvMacho()) {
138    if (is64Bit)
139      return new X8664_MachoTargetObjectFile();
140    return new TargetLoweringObjectFileMachO();
141  }
142
143  if (Subtarget->isTargetELF())
144    return new TargetLoweringObjectFileELF();
145  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
146    return new TargetLoweringObjectFileCOFF();
147  llvm_unreachable("unknown subtarget type");
148}
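// For example, an x86_64-apple-darwin target (a Mach-O environment) takes the
// first branch above and gets an X8664_MachoTargetObjectFile, while an ELF
// target such as Linux gets a TargetLoweringObjectFileELF.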
149
150X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
151  : TargetLowering(TM, createTLOF(TM)) {
152  Subtarget = &TM.getSubtarget<X86Subtarget>();
153  X86ScalarSSEf64 = Subtarget->hasSSE2();
154  X86ScalarSSEf32 = Subtarget->hasSSE1();
155  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
156
157  RegInfo = TM.getRegisterInfo();
158  TD = getTargetData();
159
160  // Set up the TargetLowering object.
161  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
162
163  // X86 is weird; it always uses i8 for shift amounts and setcc results.
164  setBooleanContents(ZeroOrOneBooleanContent);
165  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
166  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
167
168  // For 64-bit code, since we have so many registers, use the ILP scheduler;
169  // for 32-bit code, use register-pressure-specific scheduling.
170  // For Atom, always use ILP scheduling.
171  if (Subtarget->isAtom())
172    setSchedulingPreference(Sched::ILP);
173  else if (Subtarget->is64Bit())
174    setSchedulingPreference(Sched::ILP);
175  else
176    setSchedulingPreference(Sched::RegPressure);
177  setStackPointerRegisterToSaveRestore(X86StackPtr);
178
179  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
180    // Set up Windows compiler runtime calls.
181    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
182    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
183    setLibcallName(RTLIB::SREM_I64, "_allrem");
184    setLibcallName(RTLIB::UREM_I64, "_aullrem");
185    setLibcallName(RTLIB::MUL_I64, "_allmul");
186    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
187    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
188    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
189    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
190    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
191
192    // The _ftol2 runtime function has an unusual calling conv, which
193    // is modeled by a special pseudo-instruction.
194    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
195    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
196    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
197    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
198  }
199
200  if (Subtarget->isTargetDarwin()) {
201    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
202    setUseUnderscoreSetJmp(false);
203    setUseUnderscoreLongJmp(false);
204  } else if (Subtarget->isTargetMingw()) {
205    // The MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
206    setUseUnderscoreSetJmp(true);
207    setUseUnderscoreLongJmp(false);
208  } else {
209    setUseUnderscoreSetJmp(true);
210    setUseUnderscoreLongJmp(true);
211  }
212
213  // Set up the register classes.
214  addRegisterClass(MVT::i8, &X86::GR8RegClass);
215  addRegisterClass(MVT::i16, &X86::GR16RegClass);
216  addRegisterClass(MVT::i32, &X86::GR32RegClass);
217  if (Subtarget->is64Bit())
218    addRegisterClass(MVT::i64, &X86::GR64RegClass);
219
220  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
221
222  // We don't accept any truncstore of integer registers.
223  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
224  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
225  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
226  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
227  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
228  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
229
230  // SETOEQ and SETUNE require checking two conditions.
231  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
232  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
233  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
234  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
235  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
236  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
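  // For example, after a UCOMISD the "ordered and equal" result is true only
  // when ZF is set and PF is clear, so SETOEQ cannot be tested with a single
  // flag check; expanding it produces the required two-condition test.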
237
238  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
239  // operation.
240  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
241  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
242  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
243
244  if (Subtarget->is64Bit()) {
245    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
246    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
247  } else if (!TM.Options.UseSoftFloat) {
248    // We have an algorithm for SSE2->double, and we turn this into a
249    // 64-bit FILD followed by conditional FADD for other targets.
250    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
251    // We have an algorithm for SSE2, and we turn this into a 64-bit
252    // FILD for other targets.
253    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
254  }
255
256  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
257  // this operation.
258  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
259  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
260
261  if (!TM.Options.UseSoftFloat) {
262    // SSE has no i16 to fp conversion, only i32
263    if (X86ScalarSSEf32) {
264      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
265      // f32 and f64 cases are Legal, f80 case is not
266      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
267    } else {
268      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
269      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
270    }
271  } else {
272    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
273    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
274  }
275
276  // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
277  // are Legal; f80 is custom lowered.
278  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
279  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
280
281  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
282  // this operation.
283  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
284  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
285
286  if (X86ScalarSSEf32) {
287    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
288    // f32 and f64 cases are Legal, f80 case is not
289    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
290  } else {
291    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
292    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
293  }
294
295  // Handle FP_TO_UINT by promoting the destination to a larger signed
296  // conversion.
297  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
298  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
299  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
300
301  if (Subtarget->is64Bit()) {
302    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
303    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
304  } else if (!TM.Options.UseSoftFloat) {
305    // Since AVX is a superset of SSE3, only check for SSE here.
306    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
307      // Expand FP_TO_UINT into a select.
308      // FIXME: We would like to use a Custom expander here eventually to do
309      // the optimal thing for SSE vs. the default expansion in the legalizer.
310      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
311    else
312      // With SSE3 we can use fisttpll to convert to a signed i64; without
313      // SSE, we're stuck with a fistpll.
314      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
315  }
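  // Rough sketch of the Expand path above (illustrative C, not the exact DAG
  // the legalizer builds) for an f64 -> i32 FP_TO_UINT:
  //   uint32_t r = x < 2147483648.0
  //                  ? (uint32_t)(int32_t)x
  //                  : ((uint32_t)(int32_t)(x - 2147483648.0)) ^ 0x80000000u;
  // i.e. a compare, a select, and a signed conversion on each arm.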
316
317  if (isTargetFTOL()) {
318    // Use the _ftol2 runtime function, which has a pseudo-instruction
319    // to handle its weird calling convention.
320    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
321  }
322
323  // TODO: when we have SSE, these could be more efficient by using movd/movq.
324  if (!X86ScalarSSEf64) {
325    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
326    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
327    if (Subtarget->is64Bit()) {
328      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
329      // Without SSE, i64->f64 goes through memory.
330      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
331    }
332  }
333
334  // Scalar integer divide and remainder are lowered to use operations that
335  // produce two results, to match the available instructions. This exposes
336  // the two-result form to trivial CSE, which is able to combine x/y and x%y
337  // into a single instruction.
338  //
339  // Scalar integer multiply-high is also lowered to use two-result
340  // operations, to match the available instructions. However, plain multiply
341  // (low) operations are left as Legal, as there are single-result
342  // instructions for this in x86. Using the two-result multiply instructions
343  // when both high and low results are needed must be arranged by dagcombine.
344  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
345    MVT VT = IntVTs[i];
346    setOperationAction(ISD::MULHS, VT, Expand);
347    setOperationAction(ISD::MULHU, VT, Expand);
348    setOperationAction(ISD::SDIV, VT, Expand);
349    setOperationAction(ISD::UDIV, VT, Expand);
350    setOperationAction(ISD::SREM, VT, Expand);
351    setOperationAction(ISD::UREM, VT, Expand);
352
353    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
354    setOperationAction(ISD::ADDC, VT, Custom);
355    setOperationAction(ISD::ADDE, VT, Custom);
356    setOperationAction(ISD::SUBC, VT, Custom);
357    setOperationAction(ISD::SUBE, VT, Custom);
358  }
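  // Illustrative example: for i32, "q = x / y; r = x % y" ends up as a single
  // ISD::SDIVREM node after the expansion above, which matches one IDIV
  // instruction (quotient in EAX, remainder in EDX).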
359
360  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
361  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
362  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
363  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
364  if (Subtarget->is64Bit())
365    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
366  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
367  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
368  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
369  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
370  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
371  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
372  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
373  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
374
375  // Promote the i8 variants and force them up to i32, which has a shorter
376  // encoding.
377  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
378  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
379  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
380  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
381  if (Subtarget->hasBMI()) {
382    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
383    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
384    if (Subtarget->is64Bit())
385      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
386  } else {
387    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
388    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
389    if (Subtarget->is64Bit())
390      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
391  }
392
393  if (Subtarget->hasLZCNT()) {
394    // When promoting the i8 variants, force them to i32 for a shorter
395    // encoding.
396    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
397    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
398    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
399    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
400    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
401    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
402    if (Subtarget->is64Bit())
403      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
404  } else {
405    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
406    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
407    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
408    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
409    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
410    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
411    if (Subtarget->is64Bit()) {
412      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
413      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
414    }
415  }
416
417  if (Subtarget->hasPOPCNT()) {
418    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
419  } else {
420    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
421    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
422    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
423    if (Subtarget->is64Bit())
424      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
425  }
426
427  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
428  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
429
430  // These should be promoted to a larger select, which is supported.
431  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
432  // X86 wants to expand cmov itself.
433  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
434  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
435  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
436  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
437  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
438  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
439  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
440  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
441  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
442  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
443  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
444  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
445  if (Subtarget->is64Bit()) {
446    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
447    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
448  }
449  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
450
451  // Darwin ABI issue.
452  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
453  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
454  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
455  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
456  if (Subtarget->is64Bit())
457    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
458  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
459  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
460  if (Subtarget->is64Bit()) {
461    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
462    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
463    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
464    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
465    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
466  }
467  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
468  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
469  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
470  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
471  if (Subtarget->is64Bit()) {
472    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
473    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
474    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
475  }
476
477  if (Subtarget->hasSSE1())
478    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
479
480  setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
481  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
482
483  // On X86 and X86-64, atomic operations are lowered to locked instructions.
484  // Locked instructions, in turn, have implicit fence semantics (all memory
485  // operations are flushed before issuing the locked instruction, and they
486  // are not buffered), so we can fold away the common pattern of
487  // fence-atomic-fence.
488  setShouldFoldAtomicFences(true);
489
490  // Expand certain atomics
491  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
492    MVT VT = IntVTs[i];
493    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
494    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
495    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
496  }
497
498  if (!Subtarget->is64Bit()) {
499    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
500    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
501    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
502    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
503    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
504    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
505    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
506    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
507  }
508
509  if (Subtarget->hasCmpxchg16b()) {
510    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
511  }
512
513  // FIXME - use subtarget debug flags
514  if (!Subtarget->isTargetDarwin() &&
515      !Subtarget->isTargetELF() &&
516      !Subtarget->isTargetCygMing()) {
517    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
518  }
519
520  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
521  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
522  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
523  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
524  if (Subtarget->is64Bit()) {
525    setExceptionPointerRegister(X86::RAX);
526    setExceptionSelectorRegister(X86::RDX);
527  } else {
528    setExceptionPointerRegister(X86::EAX);
529    setExceptionSelectorRegister(X86::EDX);
530  }
531  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
532  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
533
534  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
535  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
536
537  setOperationAction(ISD::TRAP, MVT::Other, Legal);
538
539  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
540  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
541  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
542  if (Subtarget->is64Bit()) {
543    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
544    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
545  } else {
546    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
547    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
548  }
549
550  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
551  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
552
553  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
554    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
555                       MVT::i64 : MVT::i32, Custom);
556  else if (TM.Options.EnableSegmentedStacks)
557    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
558                       MVT::i64 : MVT::i32, Custom);
559  else
560    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
561                       MVT::i64 : MVT::i32, Expand);
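  // For example, on Windows/COFF targets the Custom lowering routes large
  // dynamic allocations through a stack-probe helper (e.g. __chkstk) instead
  // of a bare stack-pointer adjustment.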
562
563  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
564    // f32 and f64 use SSE.
565    // Set up the FP register classes.
566    addRegisterClass(MVT::f32, &X86::FR32RegClass);
567    addRegisterClass(MVT::f64, &X86::FR64RegClass);
568
569    // Use ANDPD to simulate FABS.
570    setOperationAction(ISD::FABS , MVT::f64, Custom);
571    setOperationAction(ISD::FABS , MVT::f32, Custom);
572
573    // Use XORP to simulate FNEG.
574    setOperationAction(ISD::FNEG , MVT::f64, Custom);
575    setOperationAction(ISD::FNEG , MVT::f32, Custom);
576
577    // Use ANDPD and ORPD to simulate FCOPYSIGN.
578    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
579    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
580
581    // Lower this to FGETSIGNx86 plus an AND.
582    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
583    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
584
585    // We don't support sin/cos/fmod
586    setOperationAction(ISD::FSIN , MVT::f64, Expand);
587    setOperationAction(ISD::FCOS , MVT::f64, Expand);
588    setOperationAction(ISD::FSIN , MVT::f32, Expand);
589    setOperationAction(ISD::FCOS , MVT::f32, Expand);
590
591    // Expand FP immediates into loads from the stack, except for the special
592    // cases we handle.
593    addLegalFPImmediate(APFloat(+0.0)); // xorpd
594    addLegalFPImmediate(APFloat(+0.0f)); // xorps
595  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
596    // Use SSE for f32, x87 for f64.
597    // Set up the FP register classes.
598    addRegisterClass(MVT::f32, &X86::FR32RegClass);
599    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
600
601    // Use ANDPS to simulate FABS.
602    setOperationAction(ISD::FABS , MVT::f32, Custom);
603
604    // Use XORP to simulate FNEG.
605    setOperationAction(ISD::FNEG , MVT::f32, Custom);
606
607    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
608
609    // Use ANDPS and ORPS to simulate FCOPYSIGN.
610    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
611    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
612
613    // We don't support sin/cos/fmod
614    setOperationAction(ISD::FSIN , MVT::f32, Expand);
615    setOperationAction(ISD::FCOS , MVT::f32, Expand);
616
617    // Special cases we handle for FP constants.
618    addLegalFPImmediate(APFloat(+0.0f)); // xorps
619    addLegalFPImmediate(APFloat(+0.0)); // FLD0
620    addLegalFPImmediate(APFloat(+1.0)); // FLD1
621    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
622    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
623
624    if (!TM.Options.UnsafeFPMath) {
625      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
626      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
627    }
628  } else if (!TM.Options.UseSoftFloat) {
629    // f32 and f64 in x87.
630    // Set up the FP register classes.
631    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
632    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
633
634    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
635    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
636    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
637    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
638
639    if (!TM.Options.UnsafeFPMath) {
640      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
641      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
642    }
643    addLegalFPImmediate(APFloat(+0.0)); // FLD0
644    addLegalFPImmediate(APFloat(+1.0)); // FLD1
645    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
646    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
647    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
648    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
649    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
650    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
651  }
652
653  // We don't support FMA.
654  setOperationAction(ISD::FMA, MVT::f64, Expand);
655  setOperationAction(ISD::FMA, MVT::f32, Expand);
656
657  // Long double always uses X87.
658  if (!TM.Options.UseSoftFloat) {
659    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
660    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
661    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
662    {
663      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
664      addLegalFPImmediate(TmpFlt);  // FLD0
665      TmpFlt.changeSign();
666      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
667
668      bool ignored;
669      APFloat TmpFlt2(+1.0);
670      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
671                      &ignored);
672      addLegalFPImmediate(TmpFlt2);  // FLD1
673      TmpFlt2.changeSign();
674      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
675    }
676
677    if (!TM.Options.UnsafeFPMath) {
678      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
679      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
680    }
681
682    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
683    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
684    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
685    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
686    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
687    setOperationAction(ISD::FMA, MVT::f80, Expand);
688  }
689
690  // Always use a library call for pow.
691  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
692  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
693  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
694
695  setOperationAction(ISD::FLOG, MVT::f80, Expand);
696  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
697  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
698  setOperationAction(ISD::FEXP, MVT::f80, Expand);
699  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
700
701  // First set operation action for all vector types to either promote
702  // (for widening) or expand (for scalarization). Then we will selectively
703  // turn on ones that can be effectively codegen'd.
704  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
705           VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) {
706    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
707    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
708    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
709    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
710    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
711    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
712    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
713    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
714    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
715    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
716    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
717    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
718    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
719    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
720    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
721    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
722    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
723    setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
724    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
725    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
726    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
727    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
728    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
729    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
730    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
731    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
732    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
733    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
734    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
735    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
736    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
737    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
738    setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
739    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
740    setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
741    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
742    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
743    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
744    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
745    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
746    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
747    setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand);
748    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
749    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
750    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
751    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
752    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
753    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
754    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
755    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
756    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
757    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
758    setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
759    setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
760    setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
761    setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
762    setOperationAction(ISD::VSELECT,  (MVT::SimpleValueType)VT, Expand);
763    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
764             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
765      setTruncStoreAction((MVT::SimpleValueType)VT,
766                          (MVT::SimpleValueType)InnerVT, Expand);
767    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
768    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
769    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
770  }
771
772  // FIXME: In order to prevent SSE instructions from being expanded to MMX
773  // ones with -msoft-float, disable use of MMX as well.
774  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
775    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
776    // No operations on x86mmx are supported; everything uses intrinsics.
777  }
778
779  // MMX-sized vectors (other than x86mmx) are expected to be expanded
780  // into smaller operations.
781  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
782  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
783  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
784  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
785  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
786  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
787  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
788  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
789  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
790  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
791  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
792  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
793  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
794  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
795  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
796  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
797  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
798  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
799  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
800  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
801  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
802  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
803  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
804  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
805  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
806  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
807  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
808  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
809  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
810
811  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
812    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
813
814    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
815    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
816    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
817    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
818    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
819    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
820    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
821    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
822    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
823    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
824    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
825    setOperationAction(ISD::SETCC,              MVT::v4f32, Custom);
826  }
827
828  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
829    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
830
831    // FIXME: Unfortunately -soft-float and -no-implicit-float mean that XMM
832    // registers cannot be used even for integer operations.
833    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
834    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
835    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
836    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
837
838    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
839    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
840    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
841    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
842    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
843    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
844    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
845    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
846    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
847    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
848    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
849    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
850    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
851    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
852    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
853    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
854
855    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
856    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
857    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
858    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
859
860    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
861    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
862    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
863    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
864    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
865
866    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
867    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
868    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
869    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
870    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
871
872    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
873    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
874      EVT VT = (MVT::SimpleValueType)i;
875      // Do not attempt to custom lower non-power-of-2 vectors
876      if (!isPowerOf2_32(VT.getVectorNumElements()))
877        continue;
878      // Do not attempt to custom lower non-128-bit vectors
879      if (!VT.is128BitVector())
880        continue;
881      setOperationAction(ISD::BUILD_VECTOR,
882                         VT.getSimpleVT().SimpleTy, Custom);
883      setOperationAction(ISD::VECTOR_SHUFFLE,
884                         VT.getSimpleVT().SimpleTy, Custom);
885      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
886                         VT.getSimpleVT().SimpleTy, Custom);
887    }
888
889    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
890    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
891    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
892    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
893    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
894    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
895
896    if (Subtarget->is64Bit()) {
897      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
898      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
899    }
900
901    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
902    for (int i = MVT::v16i8; i != MVT::v2i64; i++) {
903      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
904      EVT VT = SVT;
905
906      // Do not attempt to promote non-128-bit vectors
907      if (!VT.is128BitVector())
908        continue;
909
910      setOperationAction(ISD::AND,    SVT, Promote);
911      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
912      setOperationAction(ISD::OR,     SVT, Promote);
913      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
914      setOperationAction(ISD::XOR,    SVT, Promote);
915      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
916      setOperationAction(ISD::LOAD,   SVT, Promote);
917      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
918      setOperationAction(ISD::SELECT, SVT, Promote);
919      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
920    }
921
922    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
923
924    // Custom lower v2i64 and v2f64 selects.
925    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
926    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
927    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
928    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
929
930    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
931    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
932  }
933
934  if (Subtarget->hasSSE41()) {
935    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
936    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
937    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
938    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
939    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
940    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
941    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
942    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
943    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
944    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
945
946    // FIXME: Do we need to handle scalar-to-vector here?
947    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
948
949    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
950    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
951    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
952    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
953    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
954
955    // i8 and i16 vectors are custom, because the source register and source
956    // memory operand types are not the same width.  f32 vectors are
957    // custom since the immediate controlling the insert encodes additional
958    // information.
959    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
960    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
961    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
962    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
963
964    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
965    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
966    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
967    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
968
969    // FIXME: these should be Legal, but that's only for the case where
970    // the index is constant.  For now, custom expand to deal with that.
971    if (Subtarget->is64Bit()) {
972      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
973      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
974    }
975  }
976
977  if (Subtarget->hasSSE2()) {
978    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
979    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
980
981    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
982    setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
983
984    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
985    setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
986
987    if (Subtarget->hasAVX2()) {
988      setOperationAction(ISD::SRL,             MVT::v2i64, Legal);
989      setOperationAction(ISD::SRL,             MVT::v4i32, Legal);
990
991      setOperationAction(ISD::SHL,             MVT::v2i64, Legal);
992      setOperationAction(ISD::SHL,             MVT::v4i32, Legal);
993
994      setOperationAction(ISD::SRA,             MVT::v4i32, Legal);
995    } else {
996      setOperationAction(ISD::SRL,             MVT::v2i64, Custom);
997      setOperationAction(ISD::SRL,             MVT::v4i32, Custom);
998
999      setOperationAction(ISD::SHL,             MVT::v2i64, Custom);
1000      setOperationAction(ISD::SHL,             MVT::v4i32, Custom);
1001
1002      setOperationAction(ISD::SRA,             MVT::v4i32, Custom);
1003    }
1004  }
1005
1006  if (Subtarget->hasSSE42())
1007    setOperationAction(ISD::SETCC,             MVT::v2i64, Custom);
1008
1009  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
1010    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1011    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1012    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1013    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1014    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1015    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1016
1017    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1018    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1019    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1020
1021    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1022    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1023    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1024    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1025    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1026    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1027
1028    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1029    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1030    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1031    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1032    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1033    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1034
1035    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1036    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1037    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1038
1039    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4f64,  Custom);
1040    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i64,  Custom);
1041    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f32,  Custom);
1042    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i32,  Custom);
1043    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i8,  Custom);
1044    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i16, Custom);
1045
1046    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1047    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1048
1049    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1050    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1051
1052    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1053    setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1054
1055    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1056    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1057    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1058    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1059
1060    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1061    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1062    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1063
1064    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
1065    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
1066    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
1067    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
1068
1069    if (Subtarget->hasAVX2()) {
1070      setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1071      setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1072      setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1073      setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1074
1075      setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1076      setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1077      setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1078      setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1079
1080      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1081      setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1082      setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1083      // Don't lower v32i8 because there is no 128-bit byte mul
1084
1085      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1086
1087      setOperationAction(ISD::SRL,             MVT::v4i64, Legal);
1088      setOperationAction(ISD::SRL,             MVT::v8i32, Legal);
1089
1090      setOperationAction(ISD::SHL,             MVT::v4i64, Legal);
1091      setOperationAction(ISD::SHL,             MVT::v8i32, Legal);
1092
1093      setOperationAction(ISD::SRA,             MVT::v8i32, Legal);
1094    } else {
1095      setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1096      setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1097      setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1098      setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1099
1100      setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1101      setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1102      setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1103      setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1104
1105      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1106      setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1107      setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1108      // Don't lower v32i8 because there is no 128-bit byte mul
1109
1110      setOperationAction(ISD::SRL,             MVT::v4i64, Custom);
1111      setOperationAction(ISD::SRL,             MVT::v8i32, Custom);
1112
1113      setOperationAction(ISD::SHL,             MVT::v4i64, Custom);
1114      setOperationAction(ISD::SHL,             MVT::v8i32, Custom);
1115
1116      setOperationAction(ISD::SRA,             MVT::v8i32, Custom);
1117    }
1118
1119    // Custom lower several nodes for 256-bit types.
1120    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
1121             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
1122      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
1123      EVT VT = SVT;
1124
1125      // Extract subvector is special because the value type
1126      // (result) is 128-bit but the source is 256-bit wide.
1127      if (VT.is128BitVector())
1128        setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);
1129
1130      // Do not attempt to custom lower other non-256-bit vectors
1131      if (!VT.is256BitVector())
1132        continue;
1133
1134      setOperationAction(ISD::BUILD_VECTOR,       SVT, Custom);
1135      setOperationAction(ISD::VECTOR_SHUFFLE,     SVT, Custom);
1136      setOperationAction(ISD::INSERT_VECTOR_ELT,  SVT, Custom);
1137      setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
1138      setOperationAction(ISD::SCALAR_TO_VECTOR,   SVT, Custom);
1139      setOperationAction(ISD::INSERT_SUBVECTOR,   SVT, Custom);
1140    }
1141
1142    // Promote v32i8, v16i16 and v8i32 AND, OR, XOR, LOAD and SELECT to v4i64.
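    // For example (illustrative): an AND of two v8i32 values is bitcast to
    // v4i64, executed as a single 256-bit VPAND (bitwise ops ignore the lane
    // type), and the result is bitcast back to v8i32.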
1143    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1144      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
1145      EVT VT = SVT;
1146
1147      // Do not attempt to promote non-256-bit vectors
1148      if (!VT.is256BitVector())
1149        continue;
1150
1151      setOperationAction(ISD::AND,    SVT, Promote);
1152      AddPromotedToType (ISD::AND,    SVT, MVT::v4i64);
1153      setOperationAction(ISD::OR,     SVT, Promote);
1154      AddPromotedToType (ISD::OR,     SVT, MVT::v4i64);
1155      setOperationAction(ISD::XOR,    SVT, Promote);
1156      AddPromotedToType (ISD::XOR,    SVT, MVT::v4i64);
1157      setOperationAction(ISD::LOAD,   SVT, Promote);
1158      AddPromotedToType (ISD::LOAD,   SVT, MVT::v4i64);
1159      setOperationAction(ISD::SELECT, SVT, Promote);
1160      AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
1161    }
1162  }
1163
1164  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1165  // of this type with custom code.
1166  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
1167           VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
1168    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
1169                       Custom);
1170  }
1171
1172  // We want to custom lower some of our intrinsics.
1173  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1174
1175
1176  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1177  // handle type legalization for these operations here.
1178  //
1179  // FIXME: We really should do custom legalization for addition and
1180  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1181  // than generic legalization for 64-bit multiplication-with-overflow, though.
1182  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1183    // Add/Sub/Mul with overflow operations are custom lowered.
1184    MVT VT = IntVTs[i];
1185    setOperationAction(ISD::SADDO, VT, Custom);
1186    setOperationAction(ISD::UADDO, VT, Custom);
1187    setOperationAction(ISD::SSUBO, VT, Custom);
1188    setOperationAction(ISD::USUBO, VT, Custom);
1189    setOperationAction(ISD::SMULO, VT, Custom);
1190    setOperationAction(ISD::UMULO, VT, Custom);
1191  }
1192
1193  // There are no 8-bit 3-address imul/mul instructions
1194  setOperationAction(ISD::SMULO, MVT::i8, Expand);
1195  setOperationAction(ISD::UMULO, MVT::i8, Expand);
1196
1197  if (!Subtarget->is64Bit()) {
1198    // These libcalls are not available in 32-bit.
1199    setLibcallName(RTLIB::SHL_I128, 0);
1200    setLibcallName(RTLIB::SRL_I128, 0);
1201    setLibcallName(RTLIB::SRA_I128, 0);
1202  }
1203
1204  // We have target-specific dag combine patterns for the following nodes:
1205  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1206  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1207  setTargetDAGCombine(ISD::VSELECT);
1208  setTargetDAGCombine(ISD::SELECT);
1209  setTargetDAGCombine(ISD::SHL);
1210  setTargetDAGCombine(ISD::SRA);
1211  setTargetDAGCombine(ISD::SRL);
1212  setTargetDAGCombine(ISD::OR);
1213  setTargetDAGCombine(ISD::AND);
1214  setTargetDAGCombine(ISD::ADD);
1215  setTargetDAGCombine(ISD::FADD);
1216  setTargetDAGCombine(ISD::FSUB);
1217  setTargetDAGCombine(ISD::SUB);
1218  setTargetDAGCombine(ISD::LOAD);
1219  setTargetDAGCombine(ISD::STORE);
1220  setTargetDAGCombine(ISD::ZERO_EXTEND);
1221  setTargetDAGCombine(ISD::ANY_EXTEND);
1222  setTargetDAGCombine(ISD::SIGN_EXTEND);
1223  setTargetDAGCombine(ISD::TRUNCATE);
1224  setTargetDAGCombine(ISD::UINT_TO_FP);
1225  setTargetDAGCombine(ISD::SINT_TO_FP);
1226  setTargetDAGCombine(ISD::SETCC);
1227  setTargetDAGCombine(ISD::FP_TO_SINT);
1228  if (Subtarget->is64Bit())
1229    setTargetDAGCombine(ISD::MUL);
1230  if (Subtarget->hasBMI())
1231    setTargetDAGCombine(ISD::XOR);
1232
1233  computeRegisterProperties();
1234
1235  // On Darwin, -Os means optimize for size without hurting performance, so
1236  // do not reduce the store limits.
1237  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1238  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1239  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1240  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1241  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1242  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1243  setPrefLoopAlignment(4); // 2^4 bytes.
1244  benefitFromCodePlacementOpt = true;
1245
1246  setPrefFunctionAlignment(4); // 2^4 bytes.
1247}
1248
1249
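/// getSetCCResultType - Return the value type to use for ISD::SETCC: i8 for
/// scalars, and for vectors the same vector with integer elements (e.g. a
/// SETCC of two v4f32 operands yields a v4i32 lane mask).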
1250EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
1251  if (!VT.isVector()) return MVT::i8;
1252  return VT.changeVectorElementTypeToInteger();
1253}
1254
1255
1256/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1257/// the desired ByVal argument alignment.
1258static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1259  if (MaxAlign == 16)
1260    return;
1261  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1262    if (VTy->getBitWidth() == 128)
1263      MaxAlign = 16;
1264  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1265    unsigned EltAlign = 0;
1266    getMaxByValAlign(ATy->getElementType(), EltAlign);
1267    if (EltAlign > MaxAlign)
1268      MaxAlign = EltAlign;
1269  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1270    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1271      unsigned EltAlign = 0;
1272      getMaxByValAlign(STy->getElementType(i), EltAlign);
1273      if (EltAlign > MaxAlign)
1274        MaxAlign = EltAlign;
1275      if (MaxAlign == 16)
1276        break;
1277    }
1278  }
1279}
1280
1281/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1282/// function arguments in the caller parameter area. For X86, aggregates
1283/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1284/// are at 4-byte boundaries.
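/// For example (illustrative): on x86-32 with SSE, a byval struct containing a
/// <4 x float> field is placed at a 16-byte boundary, while a struct of plain
/// i32 fields keeps the 4-byte default; on x86-64 the minimum is 8 bytes.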
1285unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1286  if (Subtarget->is64Bit()) {
1287    // Max of 8 and alignment of type.
1288    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1289    if (TyAlign > 8)
1290      return TyAlign;
1291    return 8;
1292  }
1293
1294  unsigned Align = 4;
1295  if (Subtarget->hasSSE1())
1296    getMaxByValAlign(Ty, Align);
1297  return Align;
1298}
1299
1300/// getOptimalMemOpType - Returns the target specific optimal type for load
1301/// and store operations as a result of memset, memcpy, and memmove
1302/// lowering. If DstAlign is zero that means it's safe to destination
1303/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1304/// means there isn't a need to check it against alignment requirement,
1305/// probably because the source does not need to be loaded. If
1306/// 'IsZeroVal' is true, that means it's safe to return a
1307/// non-scalar-integer type, e.g. empty string source, constant, or loaded
1308/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
1309/// constant so it does not need to be loaded.
1310/// It returns EVT::Other if the type should be determined using generic
1311/// target-independent logic.
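/// For example (illustrative): a 64-byte zero memset on a subtarget with AVX2
/// and a 32-byte-aligned stack is expanded as two v8i32 (32-byte) stores;
/// with only SSE2 it becomes four v4i32 stores.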
1312EVT
1313X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1314                                       unsigned DstAlign, unsigned SrcAlign,
1315                                       bool IsZeroVal,
1316                                       bool MemcpyStrSrc,
1317                                       MachineFunction &MF) const {
1318  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
1319  // linux.  This is because the stack realignment code can't handle certain
1320  // cases like PR2962.  This should be removed when PR2962 is fixed.
1321  const Function *F = MF.getFunction();
1322  if (IsZeroVal &&
1323      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
1324    if (Size >= 16 &&
1325        (Subtarget->isUnalignedMemAccessFast() ||
1326         ((DstAlign == 0 || DstAlign >= 16) &&
1327          (SrcAlign == 0 || SrcAlign >= 16))) &&
1328        Subtarget->getStackAlignment() >= 16) {
1329      if (Subtarget->getStackAlignment() >= 32) {
1330        if (Subtarget->hasAVX2())
1331          return MVT::v8i32;
1332        if (Subtarget->hasAVX())
1333          return MVT::v8f32;
1334      }
1335      if (Subtarget->hasSSE2())
1336        return MVT::v4i32;
1337      if (Subtarget->hasSSE1())
1338        return MVT::v4f32;
1339    } else if (!MemcpyStrSrc && Size >= 8 &&
1340               !Subtarget->is64Bit() &&
1341               Subtarget->getStackAlignment() >= 8 &&
1342               Subtarget->hasSSE2()) {
1343      // Do not use f64 to lower memcpy if source is string constant. It's
1344      // better to use i32 to avoid the loads.
1345      return MVT::f64;
1346    }
1347  }
1348  if (Subtarget->is64Bit() && Size >= 8)
1349    return MVT::i64;
1350  return MVT::i32;
1351}
1352
1353/// getJumpTableEncoding - Return the entry encoding for a jump table in the
1354/// current function.  The returned value is a member of the
1355/// MachineJumpTableInfo::JTEntryKind enum.
1356unsigned X86TargetLowering::getJumpTableEncoding() const {
1357  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1358  // symbol.
1359  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1360      Subtarget->isPICStyleGOT())
1361    return MachineJumpTableInfo::EK_Custom32;
1362
1363  // Otherwise, use the normal jump table encoding heuristics.
1364  return TargetLowering::getJumpTableEncoding();
1365}
1366
1367const MCExpr *
1368X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1369                                             const MachineBasicBlock *MBB,
1370                                             unsigned uid, MCContext &Ctx) const {
1371  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1372         Subtarget->isPICStyleGOT());
1373  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1374  // entries.
1375  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1376                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1377}
1378
1379/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
1380/// jumptable.
1381SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1382                                                    SelectionDAG &DAG) const {
1383  if (!Subtarget->is64Bit())
1384    // This node has no DebugLoc associated with it, but it is not really
1385    // the same as a plain register reference.
1386    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
1387  return Table;
1388}
1389
1390/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1391/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1392/// MCExpr.
1393const MCExpr *X86TargetLowering::
1394getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1395                             MCContext &Ctx) const {
1396  // X86-64 uses RIP relative addressing based on the jump table label.
1397  if (Subtarget->isPICStyleRIPRel())
1398    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1399
1400  // Otherwise, the reference is relative to the PIC base.
1401  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1402}
1403
1404// FIXME: Why is this routine here? Move it to RegInfo!
1405std::pair<const TargetRegisterClass*, uint8_t>
1406X86TargetLowering::findRepresentativeClass(EVT VT) const {
1407  const TargetRegisterClass *RRC = 0;
1408  uint8_t Cost = 1;
1409  switch (VT.getSimpleVT().SimpleTy) {
1410  default:
1411    return TargetLowering::findRepresentativeClass(VT);
1412  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1413    RRC = Subtarget->is64Bit() ?
1414      (const TargetRegisterClass*)&X86::GR64RegClass :
1415      (const TargetRegisterClass*)&X86::GR32RegClass;
1416    break;
1417  case MVT::x86mmx:
1418    RRC = &X86::VR64RegClass;
1419    break;
1420  case MVT::f32: case MVT::f64:
1421  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1422  case MVT::v4f32: case MVT::v2f64:
1423  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1424  case MVT::v4f64:
1425    RRC = &X86::VR128RegClass;
1426    break;
1427  }
1428  return std::make_pair(RRC, Cost);
1429}
1430
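/// getStackCookieLocation - Describe where the stack-protector cookie lives on
/// Linux: %fs:0x28 on x86-64 (%gs under the Kernel code model) and %gs:0x14 on
/// i386, encoded below as an address space (256 = GS, 257 = FS) plus offset.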
1431bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1432                                               unsigned &Offset) const {
1433  if (!Subtarget->isTargetLinux())
1434    return false;
1435
1436  if (Subtarget->is64Bit()) {
1437    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
1438    Offset = 0x28;
1439    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1440      AddressSpace = 256;
1441    else
1442      AddressSpace = 257;
1443  } else {
1444    // %gs:0x14 on i386
1445    Offset = 0x14;
1446    AddressSpace = 256;
1447  }
1448  return true;
1449}
1450
1451
1452//===----------------------------------------------------------------------===//
1453//               Return Value Calling Convention Implementation
1454//===----------------------------------------------------------------------===//
1455
1456#include "X86GenCallingConv.inc"
1457
1458bool
1459X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1460                                  MachineFunction &MF, bool isVarArg,
1461                        const SmallVectorImpl<ISD::OutputArg> &Outs,
1462                        LLVMContext &Context) const {
1463  SmallVector<CCValAssign, 16> RVLocs;
1464  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1465                 RVLocs, Context);
1466  return CCInfo.CheckReturn(Outs, RetCC_X86);
1467}
1468
1469SDValue
1470X86TargetLowering::LowerReturn(SDValue Chain,
1471                               CallingConv::ID CallConv, bool isVarArg,
1472                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1473                               const SmallVectorImpl<SDValue> &OutVals,
1474                               DebugLoc dl, SelectionDAG &DAG) const {
1475  MachineFunction &MF = DAG.getMachineFunction();
1476  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1477
1478  SmallVector<CCValAssign, 16> RVLocs;
1479  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1480                 RVLocs, *DAG.getContext());
1481  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1482
1483  // Add the regs to the liveout set for the function.
1484  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1485  for (unsigned i = 0; i != RVLocs.size(); ++i)
1486    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
1487      MRI.addLiveOut(RVLocs[i].getLocReg());
1488
1489  SDValue Flag;
1490
1491  SmallVector<SDValue, 6> RetOps;
1492  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1493  // Operand #1 = Bytes To Pop
1494  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1495                   MVT::i16));
1496
1497  // Copy the result values into the output registers.
1498  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1499    CCValAssign &VA = RVLocs[i];
1500    assert(VA.isRegLoc() && "Can only return in registers!");
1501    SDValue ValToCopy = OutVals[i];
1502    EVT ValVT = ValToCopy.getValueType();
1503
1504    // If this is x86-64, and we disabled SSE, we can't return FP values,
1505    // or SSE or MMX vectors.
1506    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
1507         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
1508          (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
1509      report_fatal_error("SSE register return with SSE disabled");
1510    }
1511    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
1512    // llvm-gcc has never done it right and no one has noticed, so this
1513    // should be OK for now.
1514    if (ValVT == MVT::f64 &&
1515        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
1516      report_fatal_error("SSE2 register return with SSE2 disabled");
1517
1518    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1519    // the RET instruction and handled by the FP Stackifier.
1520    if (VA.getLocReg() == X86::ST0 ||
1521        VA.getLocReg() == X86::ST1) {
1522      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1523      // change the value to the FP stack register class.
1524      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1525        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1526      RetOps.push_back(ValToCopy);
1527      // Don't emit a copytoreg.
1528      continue;
1529    }
1530
1531    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1532    // which is returned in RAX / RDX.
1533    if (Subtarget->is64Bit()) {
1534      if (ValVT == MVT::x86mmx) {
1535        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1536          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
1537          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
1538                                  ValToCopy);
1539          // If we don't have SSE2 available, convert to v4f32 so the generated
1540          // register is legal.
1541          if (!Subtarget->hasSSE2())
1542            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
1543        }
1544      }
1545    }
1546
1547    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1548    Flag = Chain.getValue(1);
1549  }
1550
1551  // The x86-64 ABI for returning structs by value requires that we copy
1552  // the sret argument into %rax for the return. We saved the argument into
1553  // a virtual register in the entry block, so now we copy the value out
1554  // and into %rax.
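  // For example (illustrative): for "define void @f(%struct.S* sret %p)" the
  // saved copy of %p is moved into %rax before the RET, as callers may rely
  // on it per the x86-64 ABI.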
1555  if (Subtarget->is64Bit() &&
1556      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1557    MachineFunction &MF = DAG.getMachineFunction();
1558    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1559    unsigned Reg = FuncInfo->getSRetReturnReg();
1560    assert(Reg &&
1561           "SRetReturnReg should have been set in LowerFormalArguments().");
1562    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1563
1564    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1565    Flag = Chain.getValue(1);
1566
1567    // RAX now acts like a return value.
1568    MRI.addLiveOut(X86::RAX);
1569  }
1570
1571  RetOps[0] = Chain;  // Update chain.
1572
1573  // Add the flag if we have it.
1574  if (Flag.getNode())
1575    RetOps.push_back(Flag);
1576
1577  return DAG.getNode(X86ISD::RET_FLAG, dl,
1578                     MVT::Other, &RetOps[0], RetOps.size());
1579}
1580
1581bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
1582  if (N->getNumValues() != 1)
1583    return false;
1584  if (!N->hasNUsesOfValue(1, 0))
1585    return false;
1586
1587  SDValue TCChain = Chain;
1588  SDNode *Copy = *N->use_begin();
1589  if (Copy->getOpcode() == ISD::CopyToReg) {
1590    // If the copy has a glue operand, we conservatively assume it isn't safe to
1591    // perform a tail call.
1592    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
1593      return false;
1594    TCChain = Copy->getOperand(0);
1595  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
1596    return false;
1597
1598  bool HasRet = false;
1599  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
1600       UI != UE; ++UI) {
1601    if (UI->getOpcode() != X86ISD::RET_FLAG)
1602      return false;
1603    HasRet = true;
1604  }
1605
1606  if (!HasRet)
1607    return false;
1608
1609  Chain = TCChain;
1610  return true;
1611}
1612
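/// getTypeForExtArgOrReturn - Pick the type an extended argument or return
/// value is widened to. For example (illustrative), an i1 returned zeroext on
/// x86-64 only needs to be widened to i8; everything else is widened to at
/// least i32.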
1613EVT
1614X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
1615                                            ISD::NodeType ExtendKind) const {
1616  MVT ReturnMVT;
1617  // TODO: Is this also valid on 32-bit?
1618  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
1619    ReturnMVT = MVT::i8;
1620  else
1621    ReturnMVT = MVT::i32;
1622
1623  EVT MinVT = getRegisterType(Context, ReturnMVT);
1624  return VT.bitsLT(MinVT) ? MinVT : VT;
1625}
1626
1627/// LowerCallResult - Lower the result values of a call into the
1628/// appropriate copies out of appropriate physical registers.
1629///
1630SDValue
1631X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1632                                   CallingConv::ID CallConv, bool isVarArg,
1633                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1634                                   DebugLoc dl, SelectionDAG &DAG,
1635                                   SmallVectorImpl<SDValue> &InVals) const {
1636
1637  // Assign locations to each value returned by this call.
1638  SmallVector<CCValAssign, 16> RVLocs;
1639  bool Is64Bit = Subtarget->is64Bit();
1640  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1641                 getTargetMachine(), RVLocs, *DAG.getContext());
1642  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1643
1644  // Copy all of the result registers out of their specified physreg.
1645  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1646    CCValAssign &VA = RVLocs[i];
1647    EVT CopyVT = VA.getValVT();
1648
1649    // If this is x86-64, and we disabled SSE, we can't return FP values
1650    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1651        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1652      report_fatal_error("SSE register return with SSE disabled");
1653    }
1654
1655    SDValue Val;
1656
1657    // If this is a call to a function that returns an fp value on the floating
1658    // point stack, we must guarantee that the value is popped from the stack, so
1659    // a CopyFromReg is not good enough - the copy instruction may be eliminated
1660    // if the return value is not used. We use the FpPOP_RETVAL instruction
1661    // instead.
1662    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1663      // If we prefer to use the value in xmm registers, copy it out as f80 and
1664      // use a truncate to move it from fp stack reg to xmm reg.
1665      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1666      SDValue Ops[] = { Chain, InFlag };
1667      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
1668                                         MVT::Other, MVT::Glue, Ops, 2), 1);
1669      Val = Chain.getValue(0);
1670
1671      // Round the f80 to the right size, which also moves it to the appropriate
1672      // xmm register.
1673      if (CopyVT != VA.getValVT())
1674        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1675                          // This truncation won't change the value.
1676                          DAG.getIntPtrConstant(1));
1677    } else {
1678      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1679                                 CopyVT, InFlag).getValue(1);
1680      Val = Chain.getValue(0);
1681    }
1682    InFlag = Chain.getValue(2);
1683    InVals.push_back(Val);
1684  }
1685
1686  return Chain;
1687}
1688
1689
1690//===----------------------------------------------------------------------===//
1691//                C & StdCall & Fast Calling Convention implementation
1692//===----------------------------------------------------------------------===//
1693//  The StdCall calling convention is the standard for many Windows API
1694//  routines. It differs from the C calling convention only slightly: the
1695//  callee cleans up the stack rather than the caller, and symbols are
1696//  decorated (name-mangled). It doesn't support any vector arguments.
1697//  For info on fast calling convention see Fast Calling Convention (tail call)
1698//  implementation LowerX86_32FastCCCallTo.
1699
1700/// CallIsStructReturn - Determines whether a call uses struct return
1701/// semantics.
1702static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1703  if (Outs.empty())
1704    return false;
1705
1706  return Outs[0].Flags.isSRet();
1707}
1708
1709/// ArgsAreStructReturn - Determines whether a function uses struct
1710/// return semantics.
1711static bool
1712ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1713  if (Ins.empty())
1714    return false;
1715
1716  return Ins[0].Flags.isSRet();
1717}
1718
1719/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1720/// by "Src" to address "Dst" with size and alignment information specified by
1721/// the specific parameter attribute. The copy will be passed as a byval
1722/// function parameter.
1723static SDValue
1724CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1725                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1726                          DebugLoc dl) {
1727  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1728
1729  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1730                       /*isVolatile*/false, /*AlwaysInline=*/true,
1731                       MachinePointerInfo(), MachinePointerInfo());
1732}
1733
1734/// IsTailCallConvention - Return true if the calling convention is one that
1735/// supports tail call optimization.
1736static bool IsTailCallConvention(CallingConv::ID CC) {
1737  return (CC == CallingConv::Fast || CC == CallingConv::GHC);
1738}
1739
1740bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
1741  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
1742    return false;
1743
1744  CallSite CS(CI);
1745  CallingConv::ID CalleeCC = CS.getCallingConv();
1746  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1747    return false;
1748
1749  return true;
1750}
1751
1752/// FuncIsMadeTailCallSafe - Return true if the function is being made into
1753/// a tailcall target by changing its ABI.
1754static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
1755                                   bool GuaranteedTailCallOpt) {
1756  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1757}
1758
1759SDValue
1760X86TargetLowering::LowerMemArgument(SDValue Chain,
1761                                    CallingConv::ID CallConv,
1762                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1763                                    DebugLoc dl, SelectionDAG &DAG,
1764                                    const CCValAssign &VA,
1765                                    MachineFrameInfo *MFI,
1766                                    unsigned i) const {
1767  // Create the nodes corresponding to a load from this parameter slot.
1768  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1769  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
1770                              getTargetMachine().Options.GuaranteedTailCallOpt);
1771  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1772  EVT ValVT;
1773
1774  // If value is passed by pointer we have address passed instead of the value
1775  // itself.
1776  if (VA.getLocInfo() == CCValAssign::Indirect)
1777    ValVT = VA.getLocVT();
1778  else
1779    ValVT = VA.getValVT();
1780
1781  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1782  // changed with more analysis.
1783  // In case of tail call optimization, mark all arguments mutable, since they
1784  // could be overwritten by the lowering of arguments in case of a tail call.
1785  if (Flags.isByVal()) {
1786    unsigned Bytes = Flags.getByValSize();
1787    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1788    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
1789    return DAG.getFrameIndex(FI, getPointerTy());
1790  } else {
1791    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1792                                    VA.getLocMemOffset(), isImmutable);
1793    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1794    return DAG.getLoad(ValVT, dl, Chain, FIN,
1795                       MachinePointerInfo::getFixedStack(FI),
1796                       false, false, false, 0);
1797  }
1798}
1799
1800SDValue
1801X86TargetLowering::LowerFormalArguments(SDValue Chain,
1802                                        CallingConv::ID CallConv,
1803                                        bool isVarArg,
1804                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1805                                        DebugLoc dl,
1806                                        SelectionDAG &DAG,
1807                                        SmallVectorImpl<SDValue> &InVals)
1808                                          const {
1809  MachineFunction &MF = DAG.getMachineFunction();
1810  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1811
1812  const Function* Fn = MF.getFunction();
1813  if (Fn->hasExternalLinkage() &&
1814      Subtarget->isTargetCygMing() &&
1815      Fn->getName() == "main")
1816    FuncInfo->setForceFramePointer(true);
1817
1818  MachineFrameInfo *MFI = MF.getFrameInfo();
1819  bool Is64Bit = Subtarget->is64Bit();
1820  bool IsWindows = Subtarget->isTargetWindows();
1821  bool IsWin64 = Subtarget->isTargetWin64();
1822
1823  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1824         "Var args not supported with calling convention fastcc or ghc");
1825
1826  // Assign locations to all of the incoming arguments.
1827  SmallVector<CCValAssign, 16> ArgLocs;
1828  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1829                 ArgLocs, *DAG.getContext());
1830
1831  // Allocate shadow area for Win64
1832  if (IsWin64) {
1833    CCInfo.AllocateStack(32, 8);
1834  }
1835
1836  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
1837
1838  unsigned LastVal = ~0U;
1839  SDValue ArgValue;
1840  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1841    CCValAssign &VA = ArgLocs[i];
1842    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1843    // places.
1844    assert(VA.getValNo() != LastVal &&
1845           "Don't support value assigned to multiple locs yet");
1846    (void)LastVal;
1847    LastVal = VA.getValNo();
1848
1849    if (VA.isRegLoc()) {
1850      EVT RegVT = VA.getLocVT();
1851      const TargetRegisterClass *RC;
1852      if (RegVT == MVT::i32)
1853        RC = &X86::GR32RegClass;
1854      else if (Is64Bit && RegVT == MVT::i64)
1855        RC = &X86::GR64RegClass;
1856      else if (RegVT == MVT::f32)
1857        RC = &X86::FR32RegClass;
1858      else if (RegVT == MVT::f64)
1859        RC = &X86::FR64RegClass;
1860      else if (RegVT.isVector() && RegVT.getSizeInBits() == 256)
1861        RC = &X86::VR256RegClass;
1862      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
1863        RC = &X86::VR128RegClass;
1864      else if (RegVT == MVT::x86mmx)
1865        RC = &X86::VR64RegClass;
1866      else
1867        llvm_unreachable("Unknown argument type!");
1868
1869      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1870      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1871
1872      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1873      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1874      // right size.
1875      if (VA.getLocInfo() == CCValAssign::SExt)
1876        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1877                               DAG.getValueType(VA.getValVT()));
1878      else if (VA.getLocInfo() == CCValAssign::ZExt)
1879        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1880                               DAG.getValueType(VA.getValVT()));
1881      else if (VA.getLocInfo() == CCValAssign::BCvt)
1882        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
1883
1884      if (VA.isExtInLoc()) {
1885        // Handle MMX values passed in XMM regs.
1886        if (RegVT.isVector()) {
1887          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(),
1888                                 ArgValue);
1889        } else
1890          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1891      }
1892    } else {
1893      assert(VA.isMemLoc());
1894      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1895    }
1896
1897    // If value is passed via pointer - do a load.
1898    if (VA.getLocInfo() == CCValAssign::Indirect)
1899      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
1900                             MachinePointerInfo(), false, false, false, 0);
1901
1902    InVals.push_back(ArgValue);
1903  }
1904
1905  // The x86-64 ABI for returning structs by value requires that we copy
1906  // the sret argument into %rax for the return. Save the argument into
1907  // a virtual register so that we can access it from the return points.
1908  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1909    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1910    unsigned Reg = FuncInfo->getSRetReturnReg();
1911    if (!Reg) {
1912      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1913      FuncInfo->setSRetReturnReg(Reg);
1914    }
1915    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1916    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1917  }
1918
1919  unsigned StackSize = CCInfo.getNextStackOffset();
1920  // Align stack specially for tail calls.
1921  if (FuncIsMadeTailCallSafe(CallConv,
1922                             MF.getTarget().Options.GuaranteedTailCallOpt))
1923    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1924
1925  // If the function takes variable number of arguments, make a frame index for
1926  // the start of the first vararg value... for expansion of llvm.va_start.
1927  if (isVarArg) {
1928    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
1929                    CallConv != CallingConv::X86_ThisCall)) {
1930      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
1931    }
1932    if (Is64Bit) {
1933      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1934
1935      // FIXME: We should really autogenerate these arrays
1936      static const uint16_t GPR64ArgRegsWin64[] = {
1937        X86::RCX, X86::RDX, X86::R8,  X86::R9
1938      };
1939      static const uint16_t GPR64ArgRegs64Bit[] = {
1940        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1941      };
1942      static const uint16_t XMMArgRegs64Bit[] = {
1943        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1944        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1945      };
1946      const uint16_t *GPR64ArgRegs;
1947      unsigned NumXMMRegs = 0;
1948
1949      if (IsWin64) {
1950        // The XMM registers which might contain var arg parameters are shadowed
1951        // in their paired GPRs.  So we only need to save the GPRs to their home
1952        // slots.
1953        TotalNumIntRegs = 4;
1954        GPR64ArgRegs = GPR64ArgRegsWin64;
1955      } else {
1956        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1957        GPR64ArgRegs = GPR64ArgRegs64Bit;
1958
1959        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
1960                                                TotalNumXMMRegs);
1961      }
1962      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1963                                                       TotalNumIntRegs);
1964
1965      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1966      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1967             "SSE register cannot be used when SSE is disabled!");
1968      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
1969               NoImplicitFloatOps) &&
1970             "SSE register cannot be used when SSE is disabled!");
1971      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
1972          !Subtarget->hasSSE1())
1973        // Kernel mode asks for SSE to be disabled, so don't push them
1974        // on the stack.
1975        TotalNumXMMRegs = 0;
1976
1977      if (IsWin64) {
1978        const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
1979        // Get to the caller-allocated home save location.  Add 8 to account
1980        // for the return address.
1981        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
1982        FuncInfo->setRegSaveFrameIndex(
1983          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
1984        // Fixup to set vararg frame on shadow area (4 x i64).
1985        if (NumIntRegs < 4)
1986          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1987      } else {
1988        // For X86-64, if there are vararg parameters that are passed via
1989        // registers, then we must store them to their spots on the stack so
1990        // they may be loaded by dereferencing the result of va_arg.
1991        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1992        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
1993        FuncInfo->setRegSaveFrameIndex(
1994          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
1995                               false));
1996      }
1997
1998      // Store the integer parameter registers.
1999      SmallVector<SDValue, 8> MemOps;
2000      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2001                                        getPointerTy());
2002      unsigned Offset = FuncInfo->getVarArgsGPOffset();
2003      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
2004        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2005                                  DAG.getIntPtrConstant(Offset));
2006        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
2007                                     &X86::GR64RegClass);
2008        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
2009        SDValue Store =
2010          DAG.getStore(Val.getValue(1), dl, Val, FIN,
2011                       MachinePointerInfo::getFixedStack(
2012                         FuncInfo->getRegSaveFrameIndex(), Offset),
2013                       false, false, 0);
2014        MemOps.push_back(Store);
2015        Offset += 8;
2016      }
2017
2018      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
2019        // Now store the XMM (fp + vector) parameter registers.
2020        SmallVector<SDValue, 11> SaveXMMOps;
2021        SaveXMMOps.push_back(Chain);
2022
2023        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2024        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
2025        SaveXMMOps.push_back(ALVal);
2026
2027        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2028                               FuncInfo->getRegSaveFrameIndex()));
2029        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2030                               FuncInfo->getVarArgsFPOffset()));
2031
2032        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
2033          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
2034                                       &X86::VR128RegClass);
2035          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
2036          SaveXMMOps.push_back(Val);
2037        }
2038        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2039                                     MVT::Other,
2040                                     &SaveXMMOps[0], SaveXMMOps.size()));
2041      }
2042
2043      if (!MemOps.empty())
2044        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2045                            &MemOps[0], MemOps.size());
2046    }
2047  }
2048
2049  // Some CCs need callee pop.
2050  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2051                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
2052    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2053  } else {
2054    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2055    // If this is an sret function, the return should pop the hidden pointer.
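    // (i.e. such a function returns with "ret $4" instead of a plain "ret").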
2056    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2057        ArgsAreStructReturn(Ins))
2058      FuncInfo->setBytesToPopOnReturn(4);
2059  }
2060
2061  if (!Is64Bit) {
2062    // RegSaveFrameIndex is X86-64 only.
2063    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2064    if (CallConv == CallingConv::X86_FastCall ||
2065        CallConv == CallingConv::X86_ThisCall)
2066      // fastcc functions can't have varargs.
2067      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2068  }
2069
2070  FuncInfo->setArgumentStackSize(StackSize);
2071
2072  return Chain;
2073}
2074
2075SDValue
2076X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2077                                    SDValue StackPtr, SDValue Arg,
2078                                    DebugLoc dl, SelectionDAG &DAG,
2079                                    const CCValAssign &VA,
2080                                    ISD::ArgFlagsTy Flags) const {
2081  unsigned LocMemOffset = VA.getLocMemOffset();
2082  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2083  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2084  if (Flags.isByVal())
2085    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2086
2087  return DAG.getStore(Chain, dl, Arg, PtrOff,
2088                      MachinePointerInfo::getStack(LocMemOffset),
2089                      false, false, 0);
2090}
2091
2092/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
2093/// optimization is performed and it is required.
2094SDValue
2095X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2096                                           SDValue &OutRetAddr, SDValue Chain,
2097                                           bool IsTailCall, bool Is64Bit,
2098                                           int FPDiff, DebugLoc dl) const {
2099  // Adjust the Return address stack slot.
2100  EVT VT = getPointerTy();
2101  OutRetAddr = getReturnAddressFrameIndex(DAG);
2102
2103  // Load the "old" Return address.
2104  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2105                           false, false, false, 0);
2106  return SDValue(OutRetAddr.getNode(), 1);
2107}
2108
2109/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
2110/// optimization is performed and it is required (FPDiff!=0).
2111static SDValue
2112EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
2113                         SDValue Chain, SDValue RetAddrFrIdx,
2114                         bool Is64Bit, int FPDiff, DebugLoc dl) {
2115  // Store the return address to the appropriate stack slot.
2116  if (!FPDiff) return Chain;
2117  // Calculate the new stack slot for the return address.
2118  int SlotSize = Is64Bit ? 8 : 4;
2119  int NewReturnAddrFI =
2120    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
2121  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
2122  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
2123  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2124                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2125                       false, false, 0);
2126  return Chain;
2127}
2128
2129SDValue
2130X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
2131                             CallingConv::ID CallConv, bool isVarArg,
2132                             bool doesNotRet, bool &isTailCall,
2133                             const SmallVectorImpl<ISD::OutputArg> &Outs,
2134                             const SmallVectorImpl<SDValue> &OutVals,
2135                             const SmallVectorImpl<ISD::InputArg> &Ins,
2136                             DebugLoc dl, SelectionDAG &DAG,
2137                             SmallVectorImpl<SDValue> &InVals) const {
2138  MachineFunction &MF = DAG.getMachineFunction();
2139  bool Is64Bit        = Subtarget->is64Bit();
2140  bool IsWin64        = Subtarget->isTargetWin64();
2141  bool IsWindows      = Subtarget->isTargetWindows();
2142  bool IsStructRet    = CallIsStructReturn(Outs);
2143  bool IsSibcall      = false;
2144
2145  if (MF.getTarget().Options.DisableTailCalls)
2146    isTailCall = false;
2147
2148  if (isTailCall) {
2149    // Check if it's really possible to do a tail call.
2150    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2151                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
2152                                                   Outs, OutVals, Ins, DAG);
2153
2154    // Sibcalls are automatically detected tailcalls which do not require
2155    // ABI changes.
2156    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2157      IsSibcall = true;
2158
2159    if (isTailCall)
2160      ++NumTailCalls;
2161  }
2162
2163  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2164         "Var args not supported with calling convention fastcc or ghc");
2165
2166  // Analyze operands of the call, assigning locations to each operand.
2167  SmallVector<CCValAssign, 16> ArgLocs;
2168  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2169                 ArgLocs, *DAG.getContext());
2170
2171  // Allocate shadow area for Win64
2172  if (IsWin64) {
2173    CCInfo.AllocateStack(32, 8);
2174  }
2175
2176  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2177
2178  // Get a count of how many bytes are to be pushed on the stack.
2179  unsigned NumBytes = CCInfo.getNextStackOffset();
2180  if (IsSibcall)
2181    // This is a sibcall. The memory operands are available in the caller's
2182    // own stack frame.
2183    NumBytes = 0;
2184  else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
2185           IsTailCallConvention(CallConv))
2186    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2187
2188  int FPDiff = 0;
2189  if (isTailCall && !IsSibcall) {
2190    // Lower arguments at fp - stackoffset + fpdiff.
2191    unsigned NumBytesCallerPushed =
2192      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
2193    FPDiff = NumBytesCallerPushed - NumBytes;
2194
2195    // Set the delta of movement of the returnaddr stackslot.
2196    // But only set if delta is greater than previous delta.
2197    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
2198      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
2199  }
2200
2201  if (!IsSibcall)
2202    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
2203
2204  SDValue RetAddrFrIdx;
2205  // Load return address for tail calls.
2206  if (isTailCall && FPDiff)
2207    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2208                                    Is64Bit, FPDiff, dl);
2209
2210  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2211  SmallVector<SDValue, 8> MemOpChains;
2212  SDValue StackPtr;
2213
2214  // Walk the register/memloc assignments, inserting copies/loads.  In the case
2215  // of tail call optimization, arguments are handled later.
2216  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2217    CCValAssign &VA = ArgLocs[i];
2218    EVT RegVT = VA.getLocVT();
2219    SDValue Arg = OutVals[i];
2220    ISD::ArgFlagsTy Flags = Outs[i].Flags;
2221    bool isByVal = Flags.isByVal();
2222
2223    // Promote the value if needed.
2224    switch (VA.getLocInfo()) {
2225    default: llvm_unreachable("Unknown loc info!");
2226    case CCValAssign::Full: break;
2227    case CCValAssign::SExt:
2228      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2229      break;
2230    case CCValAssign::ZExt:
2231      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2232      break;
2233    case CCValAssign::AExt:
2234      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
2235        // Special case: passing MMX values in XMM registers.
2236        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2237        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2238        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2239      } else
2240        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2241      break;
2242    case CCValAssign::BCvt:
2243      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2244      break;
2245    case CCValAssign::Indirect: {
2246      // Store the argument.
2247      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2248      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2249      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2250                           MachinePointerInfo::getFixedStack(FI),
2251                           false, false, 0);
2252      Arg = SpillSlot;
2253      break;
2254    }
2255    }
2256
2257    if (VA.isRegLoc()) {
2258      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2259      if (isVarArg && IsWin64) {
2260        // Win64 ABI requires argument XMM reg to be copied to the corresponding
2261        // shadow GPR if the callee is a varargs function.
2262        unsigned ShadowReg = 0;
2263        switch (VA.getLocReg()) {
2264        case X86::XMM0: ShadowReg = X86::RCX; break;
2265        case X86::XMM1: ShadowReg = X86::RDX; break;
2266        case X86::XMM2: ShadowReg = X86::R8; break;
2267        case X86::XMM3: ShadowReg = X86::R9; break;
2268        }
2269        if (ShadowReg)
2270          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2271      }
2272    } else if (!IsSibcall && (!isTailCall || isByVal)) {
2273      assert(VA.isMemLoc());
2274      if (StackPtr.getNode() == 0)
2275        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
2276      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2277                                             dl, DAG, VA, Flags));
2278    }
2279  }
2280
2281  if (!MemOpChains.empty())
2282    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2283                        &MemOpChains[0], MemOpChains.size());
2284
2285  // Build a sequence of copy-to-reg nodes chained together with token chain
2286  // and flag operands which copy the outgoing args into registers.
2287  SDValue InFlag;
2288  // Tail call byval lowering might overwrite argument registers so in case of
2289  // tail call optimization the copies to registers are lowered later.
2290  if (!isTailCall)
2291    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2292      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2293                               RegsToPass[i].second, InFlag);
2294      InFlag = Chain.getValue(1);
2295    }
2296
2297  if (Subtarget->isPICStyleGOT()) {
2298    // ELF / PIC requires the GOT pointer to be in the EBX register before
2299    // function calls via the PLT.
2300    if (!isTailCall) {
2301      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
2302                               DAG.getNode(X86ISD::GlobalBaseReg,
2303                                           DebugLoc(), getPointerTy()),
2304                               InFlag);
2305      InFlag = Chain.getValue(1);
2306    } else {
2307      // If we are tail calling and generating PIC/GOT style code load the
2308      // address of the callee into ECX. The value in ecx is used as target of
2309      // the tail jump. This is done to circumvent the ebx/callee-saved problem
2310      // for tail calls on PIC/GOT architectures. Normally we would just put the
2311      // address of GOT into ebx and then call target@PLT. But for tail calls
2312      // ebx would be restored (since ebx is callee saved) before jumping to the
2313      // target@PLT.
2314
2315      // Note: The actual moving to ECX is done further down.
2316      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2317      if (G && !G->getGlobal()->hasHiddenVisibility() &&
2318          !G->getGlobal()->hasProtectedVisibility())
2319        Callee = LowerGlobalAddress(Callee, DAG);
2320      else if (isa<ExternalSymbolSDNode>(Callee))
2321        Callee = LowerExternalSymbol(Callee, DAG);
2322    }
2323  }
2324
2325  if (Is64Bit && isVarArg && !IsWin64) {
2326    // From AMD64 ABI document:
2327    // For calls that may call functions that use varargs or stdargs
2328    // (prototype-less calls or calls to functions containing ellipsis (...) in
2329    // the declaration), %al is used as a hidden argument to specify the number
2330    // of SSE registers used. The contents of %al do not need to match exactly
2331    // the number of registers, but must be an upper bound on the number of SSE
2332    // registers used and must be in the range 0 - 8 inclusive.
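    // For example (illustrative): a varargs call passing two doubles in
    // XMM0/XMM1 is preceded by "movb $2, %al"; any value from 2 to 8 would
    // also satisfy the ABI, since %al only needs to be an upper bound.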
2333
2334    // Count the number of XMM registers allocated.
2335    static const uint16_t XMMArgRegs[] = {
2336      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2337      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2338    };
2339    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2340    assert((Subtarget->hasSSE1() || !NumXMMRegs)
2341           && "SSE registers cannot be used when SSE is disabled");
2342
2343    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
2344                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
2345    InFlag = Chain.getValue(1);
2346  }
2347
2348
2349  // For tail calls lower the arguments to the 'real' stack slot.
2350  if (isTailCall) {
2351    // Force all the incoming stack arguments to be loaded from the stack
2352    // before any new outgoing arguments are stored to the stack, because the
2353    // outgoing stack slots may alias the incoming argument stack slots, and
2354    // the alias isn't otherwise explicit. This is slightly more conservative
2355    // than necessary, because it means that each store effectively depends
2356    // on every argument instead of just those arguments it would clobber.
2357    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2358
2359    SmallVector<SDValue, 8> MemOpChains2;
2360    SDValue FIN;
2361    int FI = 0;
2362    // Do not glue the preceding CopyToReg nodes together with the following stores.
2363    InFlag = SDValue();
2364    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2365      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2366        CCValAssign &VA = ArgLocs[i];
2367        if (VA.isRegLoc())
2368          continue;
2369        assert(VA.isMemLoc());
2370        SDValue Arg = OutVals[i];
2371        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2372        // Create frame index.
2373        int32_t Offset = VA.getLocMemOffset()+FPDiff;
2374        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2375        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2376        FIN = DAG.getFrameIndex(FI, getPointerTy());
2377
2378        if (Flags.isByVal()) {
2379          // Copy relative to framepointer.
2380          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2381          if (StackPtr.getNode() == 0)
2382            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2383                                          getPointerTy());
2384          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2385
2386          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2387                                                           ArgChain,
2388                                                           Flags, DAG, dl));
2389        } else {
2390          // Store relative to framepointer.
2391          MemOpChains2.push_back(
2392            DAG.getStore(ArgChain, dl, Arg, FIN,
2393                         MachinePointerInfo::getFixedStack(FI),
2394                         false, false, 0));
2395        }
2396      }
2397    }
2398
2399    if (!MemOpChains2.empty())
2400      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2401                          &MemOpChains2[0], MemOpChains2.size());
2402
2403    // Copy arguments to their registers.
2404    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2405      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2406                               RegsToPass[i].second, InFlag);
2407      InFlag = Chain.getValue(1);
2408    }
2409    InFlag = SDValue();
2410
2411    // Store the return address to the appropriate stack slot.
2412    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2413                                     FPDiff, dl);
2414  }
2415
2416  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2417    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2418    // In the 64-bit large code model, we have to make all calls
2419    // through a register, since the call instruction's 32-bit
2420    // pc-relative offset may not be large enough to hold the whole
2421    // address.
2422  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2423    // If the callee is a GlobalAddress node (quite common, every direct call
2424    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2425    // it.
2426
2427    // We should use an extra load for direct calls to dllimported functions in
2428    // non-JIT mode.
2429    const GlobalValue *GV = G->getGlobal();
2430    if (!GV->hasDLLImportLinkage()) {
2431      unsigned char OpFlags = 0;
2432      bool ExtraLoad = false;
2433      unsigned WrapperKind = ISD::DELETED_NODE;
2434
2435      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2436      // external symbols must go through the PLT in PIC mode.  If the symbol
2437      // has hidden or protected visibility, or if it is static or local, then
2438      // we don't need to use the PLT - we can directly call it.
2439      if (Subtarget->isTargetELF() &&
2440          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2441          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2442        OpFlags = X86II::MO_PLT;
2443      } else if (Subtarget->isPICStyleStubAny() &&
2444                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
2445                 (!Subtarget->getTargetTriple().isMacOSX() ||
2446                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2447        // PC-relative references to external symbols should go through $stub,
2448        // unless we're building with the Leopard linker or later, which
2449        // automatically synthesizes these stubs.
2450        OpFlags = X86II::MO_DARWIN_STUB;
2451      } else if (Subtarget->isPICStyleRIPRel() &&
2452                 isa<Function>(GV) &&
2453                 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) {
2454        // If the function is marked as non-lazy, generate an indirect call
2455        // which loads from the GOT directly. This avoids runtime overhead
2456        // at the cost of eager binding (and one extra byte of encoding).
2457        OpFlags = X86II::MO_GOTPCREL;
2458        WrapperKind = X86ISD::WrapperRIP;
2459        ExtraLoad = true;
2460      }
2461
2462      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2463                                          G->getOffset(), OpFlags);
2464
2465      // Add a wrapper if needed.
2466      if (WrapperKind != ISD::DELETED_NODE)
2467        Callee = DAG.getNode(WrapperKind, dl, getPointerTy(), Callee);
2468      // Add extra indirection if needed.
2469      if (ExtraLoad)
2470        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
2471                             MachinePointerInfo::getGOT(),
2472                             false, false, false, 0);
2473    }
2474  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2475    unsigned char OpFlags = 0;
2476
2477    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2478    // external symbols should go through the PLT.
2479    if (Subtarget->isTargetELF() &&
2480        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2481      OpFlags = X86II::MO_PLT;
2482    } else if (Subtarget->isPICStyleStubAny() &&
2483               (!Subtarget->getTargetTriple().isMacOSX() ||
2484                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2485      // PC-relative references to external symbols should go through $stub,
2486      // unless we're building with the Leopard linker or later, which
2487      // automatically synthesizes these stubs.
2488      OpFlags = X86II::MO_DARWIN_STUB;
2489    }
2490
2491    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2492                                         OpFlags);
2493  }
2494
2495  // Returns a chain & a flag for retval copy to use.
2496  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2497  SmallVector<SDValue, 8> Ops;
2498
2499  if (!IsSibcall && isTailCall) {
2500    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2501                           DAG.getIntPtrConstant(0, true), InFlag);
2502    InFlag = Chain.getValue(1);
2503  }
2504
2505  Ops.push_back(Chain);
2506  Ops.push_back(Callee);
2507
2508  if (isTailCall)
2509    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2510
2511  // Add argument registers to the end of the list so that they are known live
2512  // into the call.
2513  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2514    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2515                                  RegsToPass[i].second.getValueType()));
2516
2517  // Add an implicit use GOT pointer in EBX.
2518  if (!isTailCall && Subtarget->isPICStyleGOT())
2519    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2520
2521  // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
2522  if (Is64Bit && isVarArg && !IsWin64)
2523    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2524
2525  // Add a register mask operand representing the call-preserved registers.
2526  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
2527  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
2528  assert(Mask && "Missing call preserved mask for calling convention");
2529  Ops.push_back(DAG.getRegisterMask(Mask));
2530
2531  if (InFlag.getNode())
2532    Ops.push_back(InFlag);
2533
2534  if (isTailCall) {
2535    // We used to do:
2536    //// If this is the first return lowered for this function, add the regs
2537    //// to the liveout set for the function.
2538    // This isn't right, although it's probably harmless on x86; liveouts
2539    // should be computed from returns not tail calls.  Consider a void
2540    // function making a tail call to a function returning int.
2541    return DAG.getNode(X86ISD::TC_RETURN, dl,
2542                       NodeTys, &Ops[0], Ops.size());
2543  }
2544
2545  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2546  InFlag = Chain.getValue(1);
2547
2548  // Create the CALLSEQ_END node.
2549  unsigned NumBytesForCalleeToPush;
2550  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2551                       getTargetMachine().Options.GuaranteedTailCallOpt))
2552    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2553  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2554           IsStructRet)
2555    // If this is a call to a struct-return function, the callee
2556    // pops the hidden struct pointer, so we have to push it back.
2557    // This is common for Darwin/X86, Linux & Mingw32 targets.
2558    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
2559    NumBytesForCalleeToPush = 4;
2560  else
2561    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2562
2563  // Returns a flag for retval copy to use.
2564  if (!IsSibcall) {
2565    Chain = DAG.getCALLSEQ_END(Chain,
2566                               DAG.getIntPtrConstant(NumBytes, true),
2567                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2568                                                     true),
2569                               InFlag);
2570    InFlag = Chain.getValue(1);
2571  }
2572
2573  // Handle result values, copying them out of physregs into vregs that we
2574  // return.
2575  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2576                         Ins, dl, DAG, InVals);
2577}
2578
2579
2580//===----------------------------------------------------------------------===//
2581//                Fast Calling Convention (tail call) implementation
2582//===----------------------------------------------------------------------===//
2583
2584//  Like stdcall, the callee cleans up the arguments, except that ECX is
2585//  reserved for storing the address of the tail-called function. Only 2
2586//  registers are free for argument passing (inreg). Tail call optimization is
2587//  performed provided:
2588//                * tailcallopt is enabled
2589//                * caller/callee are fastcc
2590//  On the X86_64 architecture with GOT-style position-independent code, only
2591//  local (within-module) calls are supported at the moment.
2592//  To keep the stack aligned according to the platform ABI, the function
2593//  GetAlignedArgumentStackSize ensures that the argument delta is always a
2594//  multiple of the stack alignment (dynamic linkers such as darwin's dyld need
2595//  this). If the tail-called callee has more arguments than the caller, the
2596//  caller needs to make sure that there is room to move the RETADDR to. This is
2597//  done by reserving an area the size of the argument delta right after the
2598//  original RETADDR, but before the saved frame pointer or the spilled
2599//  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
2600//  stack layout:
2601//    arg1
2602//    arg2
2603//    RETADDR
2604//    [ new RETADDR
2605//      move area ]
2606//    (possible EBP)
2607//    ESI
2608//    EDI
2609//    local1 ..
2610
2611/// GetAlignedArgumentStackSize - Round up the stack argument size so that the
2612/// stack stays aligned, e.g. to 16n + 12 for a 16-byte alignment requirement.
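/// For illustration, assuming a 16-byte stack alignment and a 4-byte slot size:
/// a StackSize of 20 has (20 & 15) = 4 <= 12 and is padded to 28 (16n + 12),
/// while a StackSize of 30 has (30 & 15) = 14 > 12 and becomes
/// (30 & ~15) + 16 + 12 = 44, again of the form 16n + 12.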
2613unsigned
2614X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2615                                               SelectionDAG& DAG) const {
2616  MachineFunction &MF = DAG.getMachineFunction();
2617  const TargetMachine &TM = MF.getTarget();
2618  const TargetFrameLowering &TFI = *TM.getFrameLowering();
2619  unsigned StackAlignment = TFI.getStackAlignment();
2620  uint64_t AlignMask = StackAlignment - 1;
2621  int64_t Offset = StackSize;
2622  uint64_t SlotSize = TD->getPointerSize();
2623  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
2624    // The low bits are at most (StackAlignment - SlotSize); pad them up to it.
2625    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2626  } else {
2627    // Mask out the lower bits, then add StackAlignment plus (StackAlignment - SlotSize).
2628    Offset = ((~AlignMask) & Offset) + StackAlignment +
2629      (StackAlignment-SlotSize);
2630  }
2631  return Offset;
2632}
2633
2634/// MatchingStackOffset - Return true if the given stack call argument is
2635/// already available in the same position (relatively) of the caller's
2636/// incoming argument stack.
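/// For example, when the caller simply forwards one of its own fixed stack
/// arguments to the callee at the same byte offset and size, the outgoing value
/// is a load from (or a byval pointer to) a fixed frame object that lines up
/// with the callee's slot, so the store can be elided for a sibcall.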
2637static
2638bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2639                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2640                         const X86InstrInfo *TII) {
2641  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2642  int FI = INT_MAX;
2643  if (Arg.getOpcode() == ISD::CopyFromReg) {
2644    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2645    if (!TargetRegisterInfo::isVirtualRegister(VR))
2646      return false;
2647    MachineInstr *Def = MRI->getVRegDef(VR);
2648    if (!Def)
2649      return false;
2650    if (!Flags.isByVal()) {
2651      if (!TII->isLoadFromStackSlot(Def, FI))
2652        return false;
2653    } else {
2654      unsigned Opcode = Def->getOpcode();
2655      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2656          Def->getOperand(1).isFI()) {
2657        FI = Def->getOperand(1).getIndex();
2658        Bytes = Flags.getByValSize();
2659      } else
2660        return false;
2661    }
2662  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2663    if (Flags.isByVal())
2664      // ByVal argument is passed in as a pointer but it's now being
2665      // dereferenced. e.g.
2666      // define @foo(%struct.X* %A) {
2667      //   tail call @bar(%struct.X* byval %A)
2668      // }
2669      return false;
2670    SDValue Ptr = Ld->getBasePtr();
2671    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2672    if (!FINode)
2673      return false;
2674    FI = FINode->getIndex();
2675  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
2676    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
2677    FI = FINode->getIndex();
2678    Bytes = Flags.getByValSize();
2679  } else
2680    return false;
2681
2682  assert(FI != INT_MAX);
2683  if (!MFI->isFixedObjectIndex(FI))
2684    return false;
2685  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2686}
2687
2688/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2689/// for tail call optimization. Targets which want to do tail call
2690/// optimization should implement this function.
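/// As a sketch, a call of the following shape is typically eligible when caller
/// and callee both use the C calling convention, neither uses sret, the stack
/// needs no realignment, and the outgoing arguments already sit in matching
/// registers or caller stack slots:
///
///   define i32 @caller(i32 %a, i32 %b) {
///     %r = tail call i32 @callee(i32 %a, i32 %b)
///     ret i32 %r
///   }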
2691bool
2692X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2693                                                     CallingConv::ID CalleeCC,
2694                                                     bool isVarArg,
2695                                                     bool isCalleeStructRet,
2696                                                     bool isCallerStructRet,
2697                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
2698                                    const SmallVectorImpl<SDValue> &OutVals,
2699                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2700                                                     SelectionDAG& DAG) const {
2701  if (!IsTailCallConvention(CalleeCC) &&
2702      CalleeCC != CallingConv::C)
2703    return false;
2704
2705  // If -tailcallopt is specified, make fastcc functions tail-callable.
2706  const MachineFunction &MF = DAG.getMachineFunction();
2707  const Function *CallerF = DAG.getMachineFunction().getFunction();
2708  CallingConv::ID CallerCC = CallerF->getCallingConv();
2709  bool CCMatch = CallerCC == CalleeCC;
2710
2711  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2712    if (IsTailCallConvention(CalleeCC) && CCMatch)
2713      return true;
2714    return false;
2715  }
2716
2717  // Look for obvious safe cases to perform tail call optimization that do not
2718  // require ABI changes. This is what gcc calls sibcall.
2719
2720  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2721  // emit a special epilogue.
2722  if (RegInfo->needsStackRealignment(MF))
2723    return false;
2724
2725  // Also avoid sibcall optimization if either caller or callee uses struct
2726  // return semantics.
2727  if (isCalleeStructRet || isCallerStructRet)
2728    return false;
2729
2730  // An stdcall caller is expected to clean up its arguments; the callee
2731  // isn't going to do that.
2732  if (!CCMatch && CallerCC==CallingConv::X86_StdCall)
2733    return false;
2734
2735  // Do not sibcall optimize vararg calls unless all arguments are passed via
2736  // registers.
2737  if (isVarArg && !Outs.empty()) {
2738
2739    // Optimizing for varargs on Win64 is unlikely to be safe without
2740    // additional testing.
2741    if (Subtarget->isTargetWin64())
2742      return false;
2743
2744    SmallVector<CCValAssign, 16> ArgLocs;
2745    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2746                   getTargetMachine(), ArgLocs, *DAG.getContext());
2747
2748    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2749    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
2750      if (!ArgLocs[i].isRegLoc())
2751        return false;
2752  }
2753
2754  // If the call result is in ST0 / ST1, it needs to be popped off the x87
2755  // stack.  Therefore, if it's not used by the call it is not safe to optimize
2756  // this into a sibcall.
2757  bool Unused = false;
2758  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
2759    if (!Ins[i].Used) {
2760      Unused = true;
2761      break;
2762    }
2763  }
2764  if (Unused) {
2765    SmallVector<CCValAssign, 16> RVLocs;
2766    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
2767                   getTargetMachine(), RVLocs, *DAG.getContext());
2768    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2769    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2770      CCValAssign &VA = RVLocs[i];
2771      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
2772        return false;
2773    }
2774  }
2775
2776  // If the calling conventions do not match, then we'd better make sure the
2777  // results are returned in the same way as what the caller expects.
2778  if (!CCMatch) {
2779    SmallVector<CCValAssign, 16> RVLocs1;
2780    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
2781                    getTargetMachine(), RVLocs1, *DAG.getContext());
2782    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
2783
2784    SmallVector<CCValAssign, 16> RVLocs2;
2785    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
2786                    getTargetMachine(), RVLocs2, *DAG.getContext());
2787    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
2788
2789    if (RVLocs1.size() != RVLocs2.size())
2790      return false;
2791    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2792      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2793        return false;
2794      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2795        return false;
2796      if (RVLocs1[i].isRegLoc()) {
2797        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2798          return false;
2799      } else {
2800        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2801          return false;
2802      }
2803    }
2804  }
2805
2806  // If the callee takes no arguments then go on to check the results of the
2807  // call.
2808  if (!Outs.empty()) {
2809    // Check if stack adjustment is needed. For now, do not do this if any
2810    // argument is passed on the stack.
2811    SmallVector<CCValAssign, 16> ArgLocs;
2812    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2813                   getTargetMachine(), ArgLocs, *DAG.getContext());
2814
2815    // Allocate shadow area for Win64
2816    if (Subtarget->isTargetWin64()) {
2817      CCInfo.AllocateStack(32, 8);
2818    }
2819
2820    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2821    if (CCInfo.getNextStackOffset()) {
2822      MachineFunction &MF = DAG.getMachineFunction();
2823      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2824        return false;
2825
2826      // Check if the arguments are already laid out in the right way as
2827      // the caller's fixed stack objects.
2828      MachineFrameInfo *MFI = MF.getFrameInfo();
2829      const MachineRegisterInfo *MRI = &MF.getRegInfo();
2830      const X86InstrInfo *TII =
2831        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
2832      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2833        CCValAssign &VA = ArgLocs[i];
2834        SDValue Arg = OutVals[i];
2835        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2836        if (VA.getLocInfo() == CCValAssign::Indirect)
2837          return false;
2838        if (!VA.isRegLoc()) {
2839          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2840                                   MFI, MRI, TII))
2841            return false;
2842        }
2843      }
2844    }
2845
2846    // If the tailcall address may be in a register, then make sure it's
2847    // possible to register allocate for it. In 32-bit, the call address can
2848    // only target EAX, EDX, or ECX since the tail call must be scheduled after
2849    // callee-saved registers are restored. These happen to be the same
2850    // registers used to pass 'inreg' arguments so watch out for those.
2851    if (!Subtarget->is64Bit() &&
2852        !isa<GlobalAddressSDNode>(Callee) &&
2853        !isa<ExternalSymbolSDNode>(Callee)) {
2854      unsigned NumInRegs = 0;
2855      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2856        CCValAssign &VA = ArgLocs[i];
2857        if (!VA.isRegLoc())
2858          continue;
2859        unsigned Reg = VA.getLocReg();
2860        switch (Reg) {
2861        default: break;
2862        case X86::EAX: case X86::EDX: case X86::ECX:
2863          if (++NumInRegs == 3)
2864            return false;
2865          break;
2866        }
2867      }
2868    }
2869  }
2870
2871  return true;
2872}
2873
2874FastISel *
2875X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
2876  return X86::createFastISel(funcInfo);
2877}
2878
2879
2880//===----------------------------------------------------------------------===//
2881//                           Other Lowering Hooks
2882//===----------------------------------------------------------------------===//
2883
2884static bool MayFoldLoad(SDValue Op) {
2885  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
2886}
2887
2888static bool MayFoldIntoStore(SDValue Op) {
2889  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2890}
2891
2892static bool isTargetShuffle(unsigned Opcode) {
2893  switch(Opcode) {
2894  default: return false;
2895  case X86ISD::PSHUFD:
2896  case X86ISD::PSHUFHW:
2897  case X86ISD::PSHUFLW:
2898  case X86ISD::SHUFP:
2899  case X86ISD::PALIGN:
2900  case X86ISD::MOVLHPS:
2901  case X86ISD::MOVLHPD:
2902  case X86ISD::MOVHLPS:
2903  case X86ISD::MOVLPS:
2904  case X86ISD::MOVLPD:
2905  case X86ISD::MOVSHDUP:
2906  case X86ISD::MOVSLDUP:
2907  case X86ISD::MOVDDUP:
2908  case X86ISD::MOVSS:
2909  case X86ISD::MOVSD:
2910  case X86ISD::UNPCKL:
2911  case X86ISD::UNPCKH:
2912  case X86ISD::VPERMILP:
2913  case X86ISD::VPERM2X128:
2914    return true;
2915  }
2916}
2917
2918static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2919                                    SDValue V1, SelectionDAG &DAG) {
2920  switch(Opc) {
2921  default: llvm_unreachable("Unknown x86 shuffle node");
2922  case X86ISD::MOVSHDUP:
2923  case X86ISD::MOVSLDUP:
2924  case X86ISD::MOVDDUP:
2925    return DAG.getNode(Opc, dl, VT, V1);
2926  }
2927}
2928
2929static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2930                                    SDValue V1, unsigned TargetMask,
2931                                    SelectionDAG &DAG) {
2932  switch(Opc) {
2933  default: llvm_unreachable("Unknown x86 shuffle node");
2934  case X86ISD::PSHUFD:
2935  case X86ISD::PSHUFHW:
2936  case X86ISD::PSHUFLW:
2937  case X86ISD::VPERMILP:
2938  case X86ISD::VPERMI:
2939    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
2940  }
2941}
2942
2943static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2944                                    SDValue V1, SDValue V2, unsigned TargetMask,
2945                                    SelectionDAG &DAG) {
2946  switch(Opc) {
2947  default: llvm_unreachable("Unknown x86 shuffle node");
2948  case X86ISD::PALIGN:
2949  case X86ISD::SHUFP:
2950  case X86ISD::VPERM2X128:
2951    return DAG.getNode(Opc, dl, VT, V1, V2,
2952                       DAG.getConstant(TargetMask, MVT::i8));
2953  }
2954}
2955
2956static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2957                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
2958  switch(Opc) {
2959  default: llvm_unreachable("Unknown x86 shuffle node");
2960  case X86ISD::MOVLHPS:
2961  case X86ISD::MOVLHPD:
2962  case X86ISD::MOVHLPS:
2963  case X86ISD::MOVLPS:
2964  case X86ISD::MOVLPD:
2965  case X86ISD::MOVSS:
2966  case X86ISD::MOVSD:
2967  case X86ISD::UNPCKL:
2968  case X86ISD::UNPCKH:
2969    return DAG.getNode(Opc, dl, VT, V1, V2);
2970  }
2971}
2972
2973SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2974  MachineFunction &MF = DAG.getMachineFunction();
2975  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2976  int ReturnAddrIndex = FuncInfo->getRAIndex();
2977
2978  if (ReturnAddrIndex == 0) {
2979    // Set up a frame object for the return address.
2980    uint64_t SlotSize = TD->getPointerSize();
2981    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2982                                                           false);
2983    FuncInfo->setRAIndex(ReturnAddrIndex);
2984  }
2985
2986  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2987}
2988
2989
2990bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
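// Illustrative values for the checks below: with a symbolic displacement, the
// small code model accepts an offset such as 1 << 20 but rejects 32*1024*1024,
// while the kernel code model accepts positive offsets such as 4096 and rejects
// any negative offset.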
2991                                       bool hasSymbolicDisplacement) {
2992  // Offset should fit into 32 bit immediate field.
2993  if (!isInt<32>(Offset))
2994    return false;
2995
2996  // If we don't have a symbolic displacement - we don't have any extra
2997  // restrictions.
2998  if (!hasSymbolicDisplacement)
2999    return true;
3000
3001  // FIXME: Some tweaks might be needed for medium code model.
3002  if (M != CodeModel::Small && M != CodeModel::Kernel)
3003    return false;
3004
3005  // For the small code model we assume that the last object ends at least 16MB
3006  // below the 31-bit boundary. We may also accept pretty large negative offsets,
3007  // knowing that all objects are in the positive half of the address space.
3008  if (M == CodeModel::Small && Offset < 16*1024*1024)
3009    return true;
3010
3011  // For the kernel code model we know that all objects reside in the negative
3012  // half of the 32-bit address space. We must not accept negative offsets, since
3013  // they may fall out of that range, but we may accept pretty large positive ones.
3014  if (M == CodeModel::Kernel && Offset > 0)
3015    return true;
3016
3017  return false;
3018}
3019
3020/// isCalleePop - Determines whether the callee is required to pop its
3021/// own arguments. Callee pop is necessary to support tail calls.
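/// For example, a 32-bit stdcall, fastcall, or thiscall callee pops its own
/// arguments; vararg calls and the plain C convention never do; fastcc and GHC
/// pop only when GuaranteedTailCallOpt is enabled.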
3022bool X86::isCalleePop(CallingConv::ID CallingConv,
3023                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3024  if (IsVarArg)
3025    return false;
3026
3027  switch (CallingConv) {
3028  default:
3029    return false;
3030  case CallingConv::X86_StdCall:
3031    return !is64Bit;
3032  case CallingConv::X86_FastCall:
3033    return !is64Bit;
3034  case CallingConv::X86_ThisCall:
3035    return !is64Bit;
3036  case CallingConv::Fast:
3037    return TailCallOpt;
3038  case CallingConv::GHC:
3039    return TailCallOpt;
3040  }
3041}
3042
3043/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
3044/// X86-specific condition code, returning the condition code and the LHS/RHS
3045/// of the comparison to make.
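/// For example, an integer SETGT maps directly to COND_G, while a floating-point
/// SETOLT swaps LHS and RHS and returns COND_A, since the flags produced by a
/// UCOMISS/UCOMISD compare read like an unsigned comparison (see the flag table
/// in the function body).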
3046static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3047                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3048  if (!isFP) {
3049    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3050      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3051        // X > -1  -> compare X against 0, jump if !sign.
3052        RHS = DAG.getConstant(0, RHS.getValueType());
3053        return X86::COND_NS;
3054      }
3055      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3056        // X < 0  -> compare X against 0, jump on sign.
3057        return X86::COND_S;
3058      }
3059      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3060        // X < 1   -> X <= 0
3061        RHS = DAG.getConstant(0, RHS.getValueType());
3062        return X86::COND_LE;
3063      }
3064    }
3065
3066    switch (SetCCOpcode) {
3067    default: llvm_unreachable("Invalid integer condition!");
3068    case ISD::SETEQ:  return X86::COND_E;
3069    case ISD::SETGT:  return X86::COND_G;
3070    case ISD::SETGE:  return X86::COND_GE;
3071    case ISD::SETLT:  return X86::COND_L;
3072    case ISD::SETLE:  return X86::COND_LE;
3073    case ISD::SETNE:  return X86::COND_NE;
3074    case ISD::SETULT: return X86::COND_B;
3075    case ISD::SETUGT: return X86::COND_A;
3076    case ISD::SETULE: return X86::COND_BE;
3077    case ISD::SETUGE: return X86::COND_AE;
3078    }
3079  }
3080
3081  // First determine if it is required or is profitable to flip the operands.
3082
3083  // If LHS is a foldable load, but RHS is not, flip the condition.
3084  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3085      !ISD::isNON_EXTLoad(RHS.getNode())) {
3086    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3087    std::swap(LHS, RHS);
3088  }
3089
3090  switch (SetCCOpcode) {
3091  default: break;
3092  case ISD::SETOLT:
3093  case ISD::SETOLE:
3094  case ISD::SETUGT:
3095  case ISD::SETUGE:
3096    std::swap(LHS, RHS);
3097    break;
3098  }
3099
3100  // On a floating point condition, the flags are set as follows:
3101  // ZF  PF  CF   op
3102  //  0 | 0 | 0 | X > Y
3103  //  0 | 0 | 1 | X < Y
3104  //  1 | 0 | 0 | X == Y
3105  //  1 | 1 | 1 | unordered
3106  switch (SetCCOpcode) {
3107  default: llvm_unreachable("Condcode should be pre-legalized away");
3108  case ISD::SETUEQ:
3109  case ISD::SETEQ:   return X86::COND_E;
3110  case ISD::SETOLT:              // flipped
3111  case ISD::SETOGT:
3112  case ISD::SETGT:   return X86::COND_A;
3113  case ISD::SETOLE:              // flipped
3114  case ISD::SETOGE:
3115  case ISD::SETGE:   return X86::COND_AE;
3116  case ISD::SETUGT:              // flipped
3117  case ISD::SETULT:
3118  case ISD::SETLT:   return X86::COND_B;
3119  case ISD::SETUGE:              // flipped
3120  case ISD::SETULE:
3121  case ISD::SETLE:   return X86::COND_BE;
3122  case ISD::SETONE:
3123  case ISD::SETNE:   return X86::COND_NE;
3124  case ISD::SETUO:   return X86::COND_P;
3125  case ISD::SETO:    return X86::COND_NP;
3126  case ISD::SETOEQ:
3127  case ISD::SETUNE:  return X86::COND_INVALID;
3128  }
3129}
3130
3131/// hasFPCMov - Is there a floating-point cmov for the specific X86 condition
3132/// code? The current x86 ISA includes the following FP cmov instructions:
3133/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3134static bool hasFPCMov(unsigned X86CC) {
3135  switch (X86CC) {
3136  default:
3137    return false;
3138  case X86::COND_B:
3139  case X86::COND_BE:
3140  case X86::COND_E:
3141  case X86::COND_P:
3142  case X86::COND_A:
3143  case X86::COND_AE:
3144  case X86::COND_NE:
3145  case X86::COND_NP:
3146    return true;
3147  }
3148}
3149
3150/// isFPImmLegal - Returns true if the target can instruction select the
3151/// specified FP immediate natively. If false, the legalizer will
3152/// materialize the FP immediate as a load from a constant pool.
3153bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3154  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3155    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3156      return true;
3157  }
3158  return false;
3159}
3160
3161/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3162/// the specified range (L, H].
3163static bool isUndefOrInRange(int Val, int Low, int Hi) {
3164  return (Val < 0) || (Val >= Low && Val < Hi);
3165}
3166
3167/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3168/// specified value.
3169static bool isUndefOrEqual(int Val, int CmpVal) {
3170  if (Val < 0 || Val == CmpVal)
3171    return true;
3172  return false;
3173}
3174
3175/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3176/// at position Pos and ending at Pos+Size-1, is either undef or equal to the
3177/// corresponding value in the sequential range [Low, Low+Size).
3178static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3179                                       unsigned Pos, unsigned Size, int Low) {
3180  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3181    if (!isUndefOrEqual(Mask[i], Low))
3182      return false;
3183  return true;
3184}
3185
3186/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3187/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
3188/// the second operand.
3189static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
3190  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
3191    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
3192  if (VT == MVT::v2f64 || VT == MVT::v2i64)
3193    return (Mask[0] < 2 && Mask[1] < 2);
3194  return false;
3195}
3196
3197/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3198/// is suitable for input to PSHUFHW.
3199static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
3200  if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
3201    return false;
3202
3203  // Lower quadword copied in order or undef.
3204  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3205    return false;
3206
3207  // Upper quadword shuffled.
3208  for (unsigned i = 4; i != 8; ++i)
3209    if (!isUndefOrInRange(Mask[i], 4, 8))
3210      return false;
3211
3212  if (VT == MVT::v16i16) {
3213    // Lower quadword copied in order or undef.
3214    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3215      return false;
3216
3217    // Upper quadword shuffled.
3218    for (unsigned i = 12; i != 16; ++i)
3219      if (!isUndefOrInRange(Mask[i], 12, 16))
3220        return false;
3221  }
3222
3223  return true;
3224}
3225
3226/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3227/// is suitable for input to PSHUFLW.
3228static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
3229  if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
3230    return false;
3231
3232  // Upper quadword copied in order.
3233  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3234    return false;
3235
3236  // Lower quadword shuffled.
3237  for (unsigned i = 0; i != 4; ++i)
3238    if (!isUndefOrInRange(Mask[i], 0, 4))
3239      return false;
3240
3241  if (VT == MVT::v16i16) {
3242    // Upper quadword copied in order.
3243    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
3244      return false;
3245
3246    // Lower quadword shuffled.
3247    for (unsigned i = 8; i != 12; ++i)
3248      if (!isUndefOrInRange(Mask[i], 8, 12))
3249        return false;
3250  }
3251
3252  return true;
3253}
3254
3255/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
3256/// is suitable for input to PALIGNR.
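/// For example, for v8i16 the mask <1, 2, 3, 4, 5, 6, 7, 8> selects the
/// concatenation of the two sources shifted right by one element, which PALIGNR
/// encodes with a byte-granularity immediate.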
3257static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
3258                          const X86Subtarget *Subtarget) {
3259  if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
3260      (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()))
3261    return false;
3262
3263  unsigned NumElts = VT.getVectorNumElements();
3264  unsigned NumLanes = VT.getSizeInBits()/128;
3265  unsigned NumLaneElts = NumElts/NumLanes;
3266
3267  // Do not handle 64-bit element shuffles with palignr.
3268  if (NumLaneElts == 2)
3269    return false;
3270
3271  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
3272    unsigned i;
3273    for (i = 0; i != NumLaneElts; ++i) {
3274      if (Mask[i+l] >= 0)
3275        break;
3276    }
3277
3278    // Lane is all undef, go to next lane
3279    if (i == NumLaneElts)
3280      continue;
3281
3282    int Start = Mask[i+l];
3283
3284    // Make sure it's in this lane in one of the sources
3285    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
3286        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
3287      return false;
3288
3289    // If not lane 0, then we must match lane 0
3290    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
3291      return false;
3292
3293    // Correct second source to be contiguous with first source
3294    if (Start >= (int)NumElts)
3295      Start -= NumElts - NumLaneElts;
3296
3297    // Make sure we're shifting in the right direction.
3298    if (Start <= (int)(i+l))
3299      return false;
3300
3301    Start -= i;
3302
3303    // Check the rest of the elements to see if they are consecutive.
3304    for (++i; i != NumLaneElts; ++i) {
3305      int Idx = Mask[i+l];
3306
3307      // Make sure it's in this lane
3308      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
3309          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
3310        return false;
3311
3312      // If not lane 0, then we must match lane 0
3313      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
3314        return false;
3315
3316      if (Idx >= (int)NumElts)
3317        Idx -= NumElts - NumLaneElts;
3318
3319      if (!isUndefOrEqual(Idx, Start+i))
3320        return false;
3321
3322    }
3323  }
3324
3325  return true;
3326}
3327
3328/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3329/// the two vector operands have swapped position.
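/// For example, with NumElems == 4 the mask <0, 5, 2, 7> becomes <4, 1, 6, 3>;
/// negative (undef) entries are left unchanged.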
3330static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
3331                                     unsigned NumElems) {
3332  for (unsigned i = 0; i != NumElems; ++i) {
3333    int idx = Mask[i];
3334    if (idx < 0)
3335      continue;
3336    else if (idx < (int)NumElems)
3337      Mask[i] = idx + NumElems;
3338    else
3339      Mask[i] = idx - NumElems;
3340  }
3341}
3342
3343/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3344/// specifies a shuffle of elements that is suitable for input to 128/256-bit
3345/// SHUFPS and SHUFPD. If Commuted is true, then it checks whether the sources
3346/// are in the reverse order of what x86 shuffles want.
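/// For example, for v4f32 the mask <1, 3, 5, 7> is a valid SHUFPS pattern: the
/// low two result elements come from V1, the high two from V2, and every index
/// stays inside its 128-bit lane.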
3347static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
3348                        bool Commuted = false) {
3349  if (!HasAVX && VT.getSizeInBits() == 256)
3350    return false;
3351
3352  unsigned NumElems = VT.getVectorNumElements();
3353  unsigned NumLanes = VT.getSizeInBits()/128;
3354  unsigned NumLaneElems = NumElems/NumLanes;
3355
3356  if (NumLaneElems != 2 && NumLaneElems != 4)
3357    return false;
3358
3359  // VSHUFPSY divides the resulting vector into 4 chunks.
3360  // The sources are also split into 4 chunks, and each destination
3361  // chunk must come from a different source chunk.
3362  //
3363  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
3364  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
3365  //
3366  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
3367  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
3368  //
3369  // VSHUFPDY divides the resulting vector into 4 chunks.
3370  // The sources are also split into 4 chunks, and each destination
3371  // chunk must come from a different source chunk.
3372  //
3373  //  SRC1 =>      X3       X2       X1       X0
3374  //  SRC2 =>      Y3       Y2       Y1       Y0
3375  //
3376  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
3377  //
3378  unsigned HalfLaneElems = NumLaneElems/2;
3379  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
3380    for (unsigned i = 0; i != NumLaneElems; ++i) {
3381      int Idx = Mask[i+l];
3382      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
3383      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
3384        return false;
3385      // For VSHUFPSY, the mask of the second half must be the same as the
3386      // first but with the appropriate offsets. This works in the same way as
3387      // VPERMILPS works with masks.
3388      if (NumElems != 8 || l == 0 || Mask[i] < 0)
3389        continue;
3390      if (!isUndefOrEqual(Idx, Mask[i]+l))
3391        return false;
3392    }
3393  }
3394
3395  return true;
3396}
3397
3398/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3399/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3400static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
3401  unsigned NumElems = VT.getVectorNumElements();
3402
3403  if (VT.getSizeInBits() != 128)
3404    return false;
3405
3406  if (NumElems != 4)
3407    return false;
3408
3409  // Expect element 0 == 6, element 1 == 7, element 2 == 2, element 3 == 3
3410  return isUndefOrEqual(Mask[0], 6) &&
3411         isUndefOrEqual(Mask[1], 7) &&
3412         isUndefOrEqual(Mask[2], 2) &&
3413         isUndefOrEqual(Mask[3], 3);
3414}
3415
3416/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3417/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3418/// <2, 3, 2, 3>
3419static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
3420  unsigned NumElems = VT.getVectorNumElements();
3421
3422  if (VT.getSizeInBits() != 128)
3423    return false;
3424
3425  if (NumElems != 4)
3426    return false;
3427
3428  return isUndefOrEqual(Mask[0], 2) &&
3429         isUndefOrEqual(Mask[1], 3) &&
3430         isUndefOrEqual(Mask[2], 2) &&
3431         isUndefOrEqual(Mask[3], 3);
3432}
3433
3434/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3435/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
3436static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
3437  if (VT.getSizeInBits() != 128)
3438    return false;
3439
3440  unsigned NumElems = VT.getVectorNumElements();
3441
3442  if (NumElems != 2 && NumElems != 4)
3443    return false;
3444
3445  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3446    if (!isUndefOrEqual(Mask[i], i + NumElems))
3447      return false;
3448
3449  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
3450    if (!isUndefOrEqual(Mask[i], i))
3451      return false;
3452
3453  return true;
3454}
3455
3456/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3457/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
3458static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
3459  unsigned NumElems = VT.getVectorNumElements();
3460
3461  if ((NumElems != 2 && NumElems != 4) ||
3462      VT.getSizeInBits() > 128)
3463    return false;
3464
3465  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3466    if (!isUndefOrEqual(Mask[i], i))
3467      return false;
3468
3469  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3470    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
3471      return false;
3472
3473  return true;
3474}
3475
3476/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
3477/// specifies a shuffle of elements that is suitable for input to UNPCKL.
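/// For example, for v4i32 the canonical unpcklps pattern is <0, 4, 1, 5>, which
/// interleaves the low halves of the two sources; for 256-bit types the same
/// pattern is required independently within each 128-bit lane.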
3478static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
3479                         bool HasAVX2, bool V2IsSplat = false) {
3480  unsigned NumElts = VT.getVectorNumElements();
3481
3482  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3483         "Unsupported vector type for unpckh");
3484
3485  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3486      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3487    return false;
3488
3489  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3490  // independently on 128-bit lanes.
3491  unsigned NumLanes = VT.getSizeInBits()/128;
3492  unsigned NumLaneElts = NumElts/NumLanes;
3493
3494  for (unsigned l = 0; l != NumLanes; ++l) {
3495    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3496         i != (l+1)*NumLaneElts;
3497         i += 2, ++j) {
3498      int BitI  = Mask[i];
3499      int BitI1 = Mask[i+1];
3500      if (!isUndefOrEqual(BitI, j))
3501        return false;
3502      if (V2IsSplat) {
3503        if (!isUndefOrEqual(BitI1, NumElts))
3504          return false;
3505      } else {
3506        if (!isUndefOrEqual(BitI1, j + NumElts))
3507          return false;
3508      }
3509    }
3510  }
3511
3512  return true;
3513}
3514
3515/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
3516/// specifies a shuffle of elements that is suitable for input to UNPCKH.
3517static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
3518                         bool HasAVX2, bool V2IsSplat = false) {
3519  unsigned NumElts = VT.getVectorNumElements();
3520
3521  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3522         "Unsupported vector type for unpckh");
3523
3524  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3525      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3526    return false;
3527
3528  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3529  // independently on 128-bit lanes.
3530  unsigned NumLanes = VT.getSizeInBits()/128;
3531  unsigned NumLaneElts = NumElts/NumLanes;
3532
3533  for (unsigned l = 0; l != NumLanes; ++l) {
3534    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3535         i != (l+1)*NumLaneElts; i += 2, ++j) {
3536      int BitI  = Mask[i];
3537      int BitI1 = Mask[i+1];
3538      if (!isUndefOrEqual(BitI, j))
3539        return false;
3540      if (V2IsSplat) {
3541        if (isUndefOrEqual(BitI1, NumElts))
3542          return false;
3543      } else {
3544        if (!isUndefOrEqual(BitI1, j+NumElts))
3545          return false;
3546      }
3547    }
3548  }
3549  return true;
3550}
3551
3552/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
3553/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
3554/// <0, 0, 1, 1>
3555static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
3556                                  bool HasAVX2) {
3557  unsigned NumElts = VT.getVectorNumElements();
3558
3559  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3560         "Unsupported vector type for unpckh");
3561
3562  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3563      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3564    return false;
3565
3566  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern.
3567  // FIXME: Need a better way to get rid of this; there's no latency difference
3568  // between UNPCKLPD and MOVDDUP, so the latter should always be checked first
3569  // and the former later. We should also remove the "_undef" special mask.
3570  if (NumElts == 4 && VT.getSizeInBits() == 256)
3571    return false;
3572
3573  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3574  // independently on 128-bit lanes.
3575  unsigned NumLanes = VT.getSizeInBits()/128;
3576  unsigned NumLaneElts = NumElts/NumLanes;
3577
3578  for (unsigned l = 0; l != NumLanes; ++l) {
3579    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3580         i != (l+1)*NumLaneElts;
3581         i += 2, ++j) {
3582      int BitI  = Mask[i];
3583      int BitI1 = Mask[i+1];
3584
3585      if (!isUndefOrEqual(BitI, j))
3586        return false;
3587      if (!isUndefOrEqual(BitI1, j))
3588        return false;
3589    }
3590  }
3591
3592  return true;
3593}
3594
3595/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
3596/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
3597/// <2, 2, 3, 3>
3598static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
3599  unsigned NumElts = VT.getVectorNumElements();
3600
3601  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3602         "Unsupported vector type for unpckh");
3603
3604  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3605      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3606    return false;
3607
3608  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3609  // independently on 128-bit lanes.
3610  unsigned NumLanes = VT.getSizeInBits()/128;
3611  unsigned NumLaneElts = NumElts/NumLanes;
3612
3613  for (unsigned l = 0; l != NumLanes; ++l) {
3614    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3615         i != (l+1)*NumLaneElts; i += 2, ++j) {
3616      int BitI  = Mask[i];
3617      int BitI1 = Mask[i+1];
3618      if (!isUndefOrEqual(BitI, j))
3619        return false;
3620      if (!isUndefOrEqual(BitI1, j))
3621        return false;
3622    }
3623  }
3624  return true;
3625}
3626
3627/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3628/// specifies a shuffle of elements that is suitable for input to MOVSS,
3629/// MOVSD, and MOVD, i.e. setting the lowest element.
3630static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
3631  if (VT.getVectorElementType().getSizeInBits() < 32)
3632    return false;
3633  if (VT.getSizeInBits() == 256)
3634    return false;
3635
3636  unsigned NumElts = VT.getVectorNumElements();
3637
3638  if (!isUndefOrEqual(Mask[0], NumElts))
3639    return false;
3640
3641  for (unsigned i = 1; i != NumElts; ++i)
3642    if (!isUndefOrEqual(Mask[i], i))
3643      return false;
3644
3645  return true;
3646}
3647
3648/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
3649/// as permutations between 128-bit chunks or halves. As an example, in the
3650/// shuffle below:
3651///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
3652/// the first half comes from the second half of V1 and the second half from
3653/// the second half of V2.
3654static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
3655  if (!HasAVX || VT.getSizeInBits() != 256)
3656    return false;
3657
3658  // The shuffle result is divided into half A and half B. In total the two
3659  // sources have 4 halves, namely: C, D, E, F. The final values of A and
3660  // B must come from C, D, E or F.
3661  unsigned HalfSize = VT.getVectorNumElements()/2;
3662  bool MatchA = false, MatchB = false;
3663
3664  // Check if A comes from one of C, D, E, F.
3665  for (unsigned Half = 0; Half != 4; ++Half) {
3666    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
3667      MatchA = true;
3668      break;
3669    }
3670  }
3671
3672  // Check if B comes from one of C, D, E, F.
3673  for (unsigned Half = 0; Half != 4; ++Half) {
3674    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
3675      MatchB = true;
3676      break;
3677    }
3678  }
3679
3680  return MatchA && MatchB;
3681}
3682
3683/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
3684/// the specified VECTOR_SHUFFLE mask with the VPERM2F128/VPERM2I128 instructions.
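/// For example, for the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> the low result
/// half selects half 1 (the high half of V1) and the high result half selects
/// half 3 (the high half of V2), giving the immediate 0x31.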
3685static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
3686  EVT VT = SVOp->getValueType(0);
3687
3688  unsigned HalfSize = VT.getVectorNumElements()/2;
3689
3690  unsigned FstHalf = 0, SndHalf = 0;
3691  for (unsigned i = 0; i < HalfSize; ++i) {
3692    if (SVOp->getMaskElt(i) > 0) {
3693      FstHalf = SVOp->getMaskElt(i)/HalfSize;
3694      break;
3695    }
3696  }
3697  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
3698    if (SVOp->getMaskElt(i) > 0) {
3699      SndHalf = SVOp->getMaskElt(i)/HalfSize;
3700      break;
3701    }
3702  }
3703
3704  return (FstHalf | (SndHalf << 4));
3705}
3706
3707/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
3708/// specifies a shuffle of elements that is suitable for input to VPERMILPS*/D*.
3709/// Note that VPERMIL mask matching differs depending on whether the underlying
3710/// type is 32- or 64-bit. For VPERMILPS the high half of the mask should point
3711/// to the same elements as the low half, but into the higher half of the source.
3712/// For VPERMILPD the two lanes may be shuffled independently of each other,
3713/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
3714static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
3715  if (!HasAVX)
3716    return false;
3717
3718  unsigned NumElts = VT.getVectorNumElements();
3719  // Only match 256-bit with 32/64-bit types
3720  if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
3721    return false;
3722
3723  unsigned NumLanes = VT.getSizeInBits()/128;
3724  unsigned LaneSize = NumElts/NumLanes;
3725  for (unsigned l = 0; l != NumElts; l += LaneSize) {
3726    for (unsigned i = 0; i != LaneSize; ++i) {
3727      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
3728        return false;
3729      if (NumElts != 8 || l == 0)
3730        continue;
3731      // VPERMILPS handling
3732      if (Mask[i] < 0)
3733        continue;
3734      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
3735        return false;
3736    }
3737  }
3738
3739  return true;
3740}
3741
3742/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
3743/// x86 MOVSS/MOVSD want: MOVL requires the lowest element to be the lowest
3744/// element of vector 2 and the other elements to come from vector 1 in order.
3745static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
3746                               bool V2IsSplat = false, bool V2IsUndef = false) {
3747  unsigned NumOps = VT.getVectorNumElements();
3748  if (VT.getSizeInBits() == 256)
3749    return false;
3750  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3751    return false;
3752
3753  if (!isUndefOrEqual(Mask[0], 0))
3754    return false;
3755
3756  for (unsigned i = 1; i != NumOps; ++i)
3757    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3758          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3759          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3760      return false;
3761
3762  return true;
3763}
3764
3765/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3766/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3767/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
3768static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
3769                           const X86Subtarget *Subtarget) {
3770  if (!Subtarget->hasSSE3())
3771    return false;
3772
3773  unsigned NumElems = VT.getVectorNumElements();
3774
3775  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3776      (VT.getSizeInBits() == 256 && NumElems != 8))
3777    return false;
3778
3779  // "i+1" is the value the indexed mask element must have
3780  for (unsigned i = 0; i != NumElems; i += 2)
3781    if (!isUndefOrEqual(Mask[i], i+1) ||
3782        !isUndefOrEqual(Mask[i+1], i+1))
3783      return false;
3784
3785  return true;
3786}
3787
3788/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3789/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3790/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
3791static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
3792                           const X86Subtarget *Subtarget) {
3793  if (!Subtarget->hasSSE3())
3794    return false;
3795
3796  unsigned NumElems = VT.getVectorNumElements();
3797
3798  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3799      (VT.getSizeInBits() == 256 && NumElems != 8))
3800    return false;
3801
3802  // "i" is the value the indexed mask element must have
3803  for (unsigned i = 0; i != NumElems; i += 2)
3804    if (!isUndefOrEqual(Mask[i], i) ||
3805        !isUndefOrEqual(Mask[i+1], i))
3806      return false;
3807
3808  return true;
3809}
3810
3811/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
3812/// specifies a shuffle of elements that is suitable for input to 256-bit
3813/// version of MOVDDUP.
3814static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
3815  unsigned NumElts = VT.getVectorNumElements();
3816
3817  if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
3818    return false;
3819
3820  for (unsigned i = 0; i != NumElts/2; ++i)
3821    if (!isUndefOrEqual(Mask[i], 0))
3822      return false;
3823  for (unsigned i = NumElts/2; i != NumElts; ++i)
3824    if (!isUndefOrEqual(Mask[i], NumElts/2))
3825      return false;
3826  return true;
3827}
3828
3829/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3830/// specifies a shuffle of elements that is suitable for input to 128-bit
3831/// version of MOVDDUP.
3832static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
3833  if (VT.getSizeInBits() != 128)
3834    return false;
3835
3836  unsigned e = VT.getVectorNumElements() / 2;
3837  for (unsigned i = 0; i != e; ++i)
3838    if (!isUndefOrEqual(Mask[i], i))
3839      return false;
3840  for (unsigned i = 0; i != e; ++i)
3841    if (!isUndefOrEqual(Mask[e+i], i))
3842      return false;
3843  return true;
3844}
3845
3846/// isVEXTRACTF128Index - Return true if the specified
3847/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
3848/// suitable for input to VEXTRACTF128.
3849bool X86::isVEXTRACTF128Index(SDNode *N) {
3850  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
3851    return false;
3852
3853  // The index should be aligned on a 128-bit boundary.
3854  uint64_t Index =
3855    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
3856
3857  unsigned VL = N->getValueType(0).getVectorNumElements();
3858  unsigned VBits = N->getValueType(0).getSizeInBits();
3859  unsigned ElSize = VBits / VL;
3860  bool Result = (Index * ElSize) % 128 == 0;
3861
3862  return Result;
3863}
3864
3865/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
3866/// operand specifies a subvector insert that is suitable for input to
3867/// VINSERTF128.
3868bool X86::isVINSERTF128Index(SDNode *N) {
3869  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
3870    return false;
3871
3872  // The index should be aligned on a 128-bit boundary.
3873  uint64_t Index =
3874    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
3875
3876  unsigned VL = N->getValueType(0).getVectorNumElements();
3877  unsigned VBits = N->getValueType(0).getSizeInBits();
3878  unsigned ElSize = VBits / VL;
3879  bool Result = (Index * ElSize) % 128 == 0;
3880
3881  return Result;
3882}
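
// A numeric sketch of the alignment check used by both predicates above: with
// f32 elements ElSize is 32, so an index of 4 gives 4 * 32 = 128 bits, which
// is 128-bit aligned and accepted, while an index of 2 gives 64 and is
// rejected.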
3883
3884/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
3885/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
3886/// Handles 128-bit and 256-bit.
3887static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
3888  EVT VT = N->getValueType(0);
3889
3890  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3891         "Unsupported vector type for PSHUF/SHUFP");
3892
3893  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
3894  // independently on 128-bit lanes.
3895  unsigned NumElts = VT.getVectorNumElements();
3896  unsigned NumLanes = VT.getSizeInBits()/128;
3897  unsigned NumLaneElts = NumElts/NumLanes;
3898
3899  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
3900         "Only supports 2 or 4 elements per lane");
3901
3902  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
3903  unsigned Mask = 0;
3904  for (unsigned i = 0; i != NumElts; ++i) {
3905    int Elt = N->getMaskElt(i);
3906    if (Elt < 0) continue;
3907    Elt &= NumLaneElts - 1;
3908    unsigned ShAmt = (i << Shift) % 8;
3909    Mask |= Elt << ShAmt;
3910  }
3911
3912  return Mask;
3913}
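
// Illustrative encoding, assuming a v4f32 shuffle (so Shift == 1): for the
// mask <3, 1, 2, 0> the 2-bit fields are packed as 3 | (1 << 2) | (2 << 4) |
// (0 << 6), giving an immediate of 0x27.  For 256-bit types the shift amount
// wraps modulo 8, so each 128-bit lane contributes to the same 8-bit pattern.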
3914
3915/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
3916/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
3917static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
3918  EVT VT = N->getValueType(0);
3919
3920  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
3921         "Unsupported vector type for PSHUFHW");
3922
3923  unsigned NumElts = VT.getVectorNumElements();
3924
3925  unsigned Mask = 0;
3926  for (unsigned l = 0; l != NumElts; l += 8) {
3927    // 8 mask elements per lane, but we only care about the last 4.
3928    for (unsigned i = 0; i < 4; ++i) {
3929      int Elt = N->getMaskElt(l+i+4);
3930      if (Elt < 0) continue;
3931      Elt &= 0x3; // only 2-bits.
3932      Mask |= Elt << (i * 2);
3933    }
3934  }
3935
3936  return Mask;
3937}
3938
3939/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
3940/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
3941static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
3942  EVT VT = N->getValueType(0);
3943
3944  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
3945         "Unsupported vector type for PSHUFLW");
3946
3947  unsigned NumElts = VT.getVectorNumElements();
3948
3949  unsigned Mask = 0;
3950  for (unsigned l = 0; l != NumElts; l += 8) {
3951    // 8 mask elements per lane, but we only care about the first 4.
3952    for (unsigned i = 0; i < 4; ++i) {
3953      int Elt = N->getMaskElt(l+i);
3954      if (Elt < 0) continue;
3955      Elt &= 0x3; // only 2-bits
3956      Mask |= Elt << (i * 2);
3957    }
3958  }
3959
3960  return Mask;
3961}
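
// As a concrete sketch: for a v8i16 mask whose low four entries are
// <2, 1, 3, 0, ...>, getShufflePSHUFLWImmediate packs 2 | (1 << 2) | (3 << 4)
// | (0 << 6) = 0x36.  getShufflePSHUFHWImmediate does the same with the high
// four entries of each lane, reduced to two bits with "& 0x3".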
3962
3963/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
3964/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
3965static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
3966  EVT VT = SVOp->getValueType(0);
3967  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
3968
3969  unsigned NumElts = VT.getVectorNumElements();
3970  unsigned NumLanes = VT.getSizeInBits()/128;
3971  unsigned NumLaneElts = NumElts/NumLanes;
3972
3973  int Val = 0;
3974  unsigned i;
3975  for (i = 0; i != NumElts; ++i) {
3976    Val = SVOp->getMaskElt(i);
3977    if (Val >= 0)
3978      break;
3979  }
3980  if (Val >= (int)NumElts)
3981    Val -= NumElts - NumLaneElts;
3982
3983  assert(Val - i > 0 && "PALIGNR imm should be positive");
3984  return (Val - i) * EltSize;
3985}
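
// For example (a sketch): a v8i16 shuffle with mask <1, 2, 3, 4, 5, 6, 7, 8>
// has its first defined element, 1, at position 0, so the returned byte
// immediate is (1 - 0) * 2 = 2, i.e. a PALIGNR by two bytes.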
3986
3987/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
3988/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
3989/// instructions.
3990unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
3991  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
3992    llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
3993
3994  uint64_t Index =
3995    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
3996
3997  EVT VecVT = N->getOperand(0).getValueType();
3998  EVT ElVT = VecVT.getVectorElementType();
3999
4000  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
4001  return Index / NumElemsPerChunk;
4002}
4003
4004/// getInsertVINSERTF128Immediate - Return the appropriate immediate
4005/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
4006/// instructions.
4007unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
4008  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4009    llvm_unreachable("Illegal insert subvector for VINSERTF128");
4010
4011  uint64_t Index =
4012    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4013
4014  EVT VecVT = N->getValueType(0);
4015  EVT ElVT = VecVT.getVectorElementType();
4016
4017  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
4018  return Index / NumElemsPerChunk;
4019}
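
// Numeric sketch: inserting a 128-bit subvector into a v8i32 at element index
// 4 gives NumElemsPerChunk = 128 / 32 = 4 and an immediate of 4 / 4 = 1 (the
// upper half), while element index 0 maps to immediate 0 (the lower half).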
4020
4021/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
4022/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
4023/// Handles 256-bit.
4024static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
4025  EVT VT = N->getValueType(0);
4026
4027  unsigned NumElts = VT.getVectorNumElements();
4028
4029  assert((VT.is256BitVector() && NumElts == 4) &&
4030         "Unsupported vector type for VPERMQ/VPERMPD");
4031
4032  unsigned Mask = 0;
4033  for (unsigned i = 0; i != NumElts; ++i) {
4034    int Elt = N->getMaskElt(i);
4035    if (Elt < 0)
4036      continue;
4037    Mask |= Elt << (i*2);
4038  }
4039
4040  return Mask;
4041}

4042/// isZeroNode - Returns true if Elt is a constant zero or a floating point
4043/// constant +0.0.
4044bool X86::isZeroNode(SDValue Elt) {
4045  return ((isa<ConstantSDNode>(Elt) &&
4046           cast<ConstantSDNode>(Elt)->isNullValue()) ||
4047          (isa<ConstantFPSDNode>(Elt) &&
4048           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
4049}
4050
4051/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
4052/// their permute mask.
4053static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
4054                                    SelectionDAG &DAG) {
4055  EVT VT = SVOp->getValueType(0);
4056  unsigned NumElems = VT.getVectorNumElements();
4057  SmallVector<int, 8> MaskVec;
4058
4059  for (unsigned i = 0; i != NumElems; ++i) {
4060    int idx = SVOp->getMaskElt(i);
4061    if (idx < 0)
4062      MaskVec.push_back(idx);
4063    else if (idx < (int)NumElems)
4064      MaskVec.push_back(idx + NumElems);
4065    else
4066      MaskVec.push_back(idx - NumElems);
4067  }
4068  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
4069                              SVOp->getOperand(0), &MaskVec[0]);
4070}
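
// For example (illustrative only): commuting a v4i32 shuffle of (V1, V2) with
// mask <0, 1, 4, 5> produces a shuffle of (V2, V1) with mask <4, 5, 0, 1>;
// undef entries (negative indices) are left untouched.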
4071
4072/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
4073/// match movhlps. The lower half elements should come from upper half of
4074/// V1 (and in order), and the upper half elements should come from the upper
4075/// half of V2 (and in order).
4076static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
4077  if (VT.getSizeInBits() != 128)
4078    return false;
4079  if (VT.getVectorNumElements() != 4)
4080    return false;
4081  for (unsigned i = 0, e = 2; i != e; ++i)
4082    if (!isUndefOrEqual(Mask[i], i+2))
4083      return false;
4084  for (unsigned i = 2; i != 4; ++i)
4085    if (!isUndefOrEqual(Mask[i], i+4))
4086      return false;
4087  return true;
4088}
4089
4090/// isScalarLoadToVector - Returns true if the node is a scalar load that
4091/// is promoted to a vector. It also returns the LoadSDNode by reference if
4092/// required.
4093static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
4094  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
4095    return false;
4096  N = N->getOperand(0).getNode();
4097  if (!ISD::isNON_EXTLoad(N))
4098    return false;
4099  if (LD)
4100    *LD = cast<LoadSDNode>(N);
4101  return true;
4102}
4103
4104// Test whether the given value is a vector value which will be legalized
4105// into a load.
4106static bool WillBeConstantPoolLoad(SDNode *N) {
4107  if (N->getOpcode() != ISD::BUILD_VECTOR)
4108    return false;
4109
4110  // Check for any non-constant elements.
4111  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
4112    switch (N->getOperand(i).getNode()->getOpcode()) {
4113    case ISD::UNDEF:
4114    case ISD::ConstantFP:
4115    case ISD::Constant:
4116      break;
4117    default:
4118      return false;
4119    }
4120
4121  // Vectors of all-zeros and all-ones are materialized with special
4122  // instructions rather than being loaded.
4123  return !ISD::isBuildVectorAllZeros(N) &&
4124         !ISD::isBuildVectorAllOnes(N);
4125}
4126
4127/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
4128/// match movlp{s|d}. The lower half elements should come from the lower half of
4129/// V1 (and in order), and the upper half elements should come from the upper
4130/// half of V2 (and in order). And since V1 will become the source of the
4131/// MOVLP, it must be either a vector load or a scalar load to vector.
4132static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
4133                               ArrayRef<int> Mask, EVT VT) {
4134  if (VT.getSizeInBits() != 128)
4135    return false;
4136
4137  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
4138    return false;
4139  // If V2 is a vector load, don't do this transformation. We will try to use
4140  // load folding with a shufps op instead.
4141  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
4142    return false;
4143
4144  unsigned NumElems = VT.getVectorNumElements();
4145
4146  if (NumElems != 2 && NumElems != 4)
4147    return false;
4148  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4149    if (!isUndefOrEqual(Mask[i], i))
4150      return false;
4151  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4152    if (!isUndefOrEqual(Mask[i], i+NumElems))
4153      return false;
4154  return true;
4155}
4156
4157/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
4158/// all the same.
4159static bool isSplatVector(SDNode *N) {
4160  if (N->getOpcode() != ISD::BUILD_VECTOR)
4161    return false;
4162
4163  SDValue SplatValue = N->getOperand(0);
4164  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
4165    if (N->getOperand(i) != SplatValue)
4166      return false;
4167  return true;
4168}
4169
4170/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
4171/// to a zero vector.
4172/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
4173static bool isZeroShuffle(ShuffleVectorSDNode *N) {
4174  SDValue V1 = N->getOperand(0);
4175  SDValue V2 = N->getOperand(1);
4176  unsigned NumElems = N->getValueType(0).getVectorNumElements();
4177  for (unsigned i = 0; i != NumElems; ++i) {
4178    int Idx = N->getMaskElt(i);
4179    if (Idx >= (int)NumElems) {
4180      unsigned Opc = V2.getOpcode();
4181      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
4182        continue;
4183      if (Opc != ISD::BUILD_VECTOR ||
4184          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
4185        return false;
4186    } else if (Idx >= 0) {
4187      unsigned Opc = V1.getOpcode();
4188      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
4189        continue;
4190      if (Opc != ISD::BUILD_VECTOR ||
4191          !X86::isZeroNode(V1.getOperand(Idx)))
4192        return false;
4193    }
4194  }
4195  return true;
4196}
4197
4198/// getZeroVector - Returns a vector of specified type with all zero elements.
4199///
4200static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
4201                             SelectionDAG &DAG, DebugLoc dl) {
4202  assert(VT.isVector() && "Expected a vector type");
4203  unsigned Size = VT.getSizeInBits();
4204
4205  // Always build SSE zero vectors as <4 x i32> bitcasted
4206  // to their dest type. This ensures they get CSE'd.
4207  SDValue Vec;
4208  if (Size == 128) {  // SSE
4209    if (Subtarget->hasSSE2()) {  // SSE2
4210      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4211      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4212    } else { // SSE1
4213      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4214      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
4215    }
4216  } else if (Size == 256) { // AVX
4217    if (Subtarget->hasAVX2()) { // AVX2
4218      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4219      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4220      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4221    } else {
4222      // 256-bit logic and arithmetic instructions in AVX are all
4223      // floating-point, no support for integer ops. Emit fp zeroed vectors.
4224      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4225      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4226      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
4227    }
4228  } else
4229    llvm_unreachable("Unexpected vector type");
4230
4231  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4232}
4233
4234/// getOnesVector - Returns a vector of specified type with all bits set.
4235/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4236/// no AVX2 support, use two <4 x i32>s inserted into an <8 x i32> appropriately.
4237/// Then bitcast to their original type, ensuring they get CSE'd.
4238static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
4239                             DebugLoc dl) {
4240  assert(VT.isVector() && "Expected a vector type");
4241  unsigned Size = VT.getSizeInBits();
4242
4243  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
4244  SDValue Vec;
4245  if (Size == 256) {
4246    if (HasAVX2) { // AVX2
4247      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4248      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4249    } else { // AVX
4250      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4251      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4252    }
4253  } else if (Size == 128) {
4254    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4255  } else
4256    llvm_unreachable("Unexpected vector type");
4257
4258  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4259}
4260
4261/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
4262/// that point to V2 point to its first element.
4263static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
4264  for (unsigned i = 0; i != NumElems; ++i) {
4265    if (Mask[i] > (int)NumElems) {
4266      Mask[i] = NumElems;
4267    }
4268  }
4269}
4270
4271/// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd
4272/// operation of the specified width.
4273static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4274                       SDValue V2) {
4275  unsigned NumElems = VT.getVectorNumElements();
4276  SmallVector<int, 8> Mask;
4277  Mask.push_back(NumElems);
4278  for (unsigned i = 1; i != NumElems; ++i)
4279    Mask.push_back(i);
4280  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4281}
4282
4283/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
4284static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4285                          SDValue V2) {
4286  unsigned NumElems = VT.getVectorNumElements();
4287  SmallVector<int, 8> Mask;
4288  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4289    Mask.push_back(i);
4290    Mask.push_back(i + NumElems);
4291  }
4292  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4293}
4294
4295/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
4296static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4297                          SDValue V2) {
4298  unsigned NumElems = VT.getVectorNumElements();
4299  SmallVector<int, 8> Mask;
4300  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4301    Mask.push_back(i + Half);
4302    Mask.push_back(i + NumElems + Half);
4303  }
4304  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4305}
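
// Mask sketches for the three helpers above, for a v4f32 (NumElems == 4):
//   getMOVL    -> <4, 1, 2, 3>  (low element taken from V2)
//   getUnpackl -> <0, 4, 1, 5>  (interleave the low halves of V1 and V2)
//   getUnpackh -> <2, 6, 3, 7>  (interleave the high halves of V1 and V2)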
4306
4307// PromoteSplati8i16 - i16 and i8 vector types cannot be used directly by
4308// a generic shuffle instruction because the target has no such instructions.
4309// Generate shuffles which repeat i16 and i8 several times until they can be
4310// represented by v4f32 and then be manipulated by target supported shuffles.
4311static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
4312  EVT VT = V.getValueType();
4313  int NumElems = VT.getVectorNumElements();
4314  DebugLoc dl = V.getDebugLoc();
4315
4316  while (NumElems > 4) {
4317    if (EltNo < NumElems/2) {
4318      V = getUnpackl(DAG, dl, VT, V, V);
4319    } else {
4320      V = getUnpackh(DAG, dl, VT, V, V);
4321      EltNo -= NumElems/2;
4322    }
4323    NumElems >>= 1;
4324  }
4325  return V;
4326}
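
// A worked example, assuming a v8i16 splat of element 5: the single iteration
// sees EltNo >= NumElems/2, so it unpacks the vector against itself with
// getUnpackh, yielding <4, 4, 5, 5, 6, 6, 7, 7>, and EltNo becomes 1.  Viewed
// as v4f32, splatting element 1 now reproduces the original i16 splat.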
4327
4328/// getLegalSplat - Generate a legal splat with supported x86 shuffles
4329static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
4330  EVT VT = V.getValueType();
4331  DebugLoc dl = V.getDebugLoc();
4332  unsigned Size = VT.getSizeInBits();
4333
4334  if (Size == 128) {
4335    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
4336    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
4337    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
4338                             &SplatMask[0]);
4339  } else if (Size == 256) {
4340    // To use VPERMILPS to splat scalars, the second half of indices must
4341    // refer to the higher part, which is a duplication of the lower one,
4342    // because VPERMILPS can only handle in-lane permutations.
4343    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
4344                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
4345
4346    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
4347    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
4348                             &SplatMask[0]);
4349  } else
4350    llvm_unreachable("Vector size not supported");
4351
4352  return DAG.getNode(ISD::BITCAST, dl, VT, V);
4353}
4354
4355/// PromoteSplat - Splat is promoted to target supported vector shuffles.
4356static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
4357  EVT SrcVT = SV->getValueType(0);
4358  SDValue V1 = SV->getOperand(0);
4359  DebugLoc dl = SV->getDebugLoc();
4360
4361  int EltNo = SV->getSplatIndex();
4362  int NumElems = SrcVT.getVectorNumElements();
4363  unsigned Size = SrcVT.getSizeInBits();
4364
4365  assert(((Size == 128 && NumElems > 4) || Size == 256) &&
4366          "Unknown how to promote splat for type");
4367
4368  // Extract the 128-bit part containing the splat element and update
4369  // the splat element index when it refers to the higher register.
4370  if (Size == 256) {
4371    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
4372    if (EltNo >= NumElems/2)
4373      EltNo -= NumElems/2;
4374  }
4375
4376  // i16 and i8 vector types cannot be used directly by a generic shuffle
4377  // instruction because the target has no such instruction. Generate shuffles
4378  // which repeat i16 and i8 several times until they fit in i32, and then can
4379  // be manipulated by target supported shuffles.
4380  EVT EltVT = SrcVT.getVectorElementType();
4381  if (EltVT == MVT::i8 || EltVT == MVT::i16)
4382    V1 = PromoteSplati8i16(V1, DAG, EltNo);
4383
4384  // Recreate the 256-bit vector and place the same 128-bit vector
4385  // into the low and high part. This is necessary because we want
4386  // to use VPERM* to shuffle the vectors.
4387  if (Size == 256) {
4388    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
4389  }
4390
4391  return getLegalSplat(DAG, V1, EltNo);
4392}
4393
4394/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
4395/// vector and a zero or undef vector.  This produces a shuffle where the low
4396/// element of V2 is swizzled into the zero/undef vector, landing at element
4397/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
4398static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
4399                                           bool IsZero,
4400                                           const X86Subtarget *Subtarget,
4401                                           SelectionDAG &DAG) {
4402  EVT VT = V2.getValueType();
4403  SDValue V1 = IsZero
4404    ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
4405  unsigned NumElems = VT.getVectorNumElements();
4406  SmallVector<int, 16> MaskVec;
4407  for (unsigned i = 0; i != NumElems; ++i)
4408    // If this is the insertion idx, put the low elt of V2 here.
4409    MaskVec.push_back(i == Idx ? NumElems : i);
4410  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
4411}
4412
4413/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
4414/// target specific opcode. Returns true if the Mask could be calculated.
4415/// Sets IsUnary to true if the shuffle only uses one source.
4416static bool getTargetShuffleMask(SDNode *N, EVT VT,
4417                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
4418  unsigned NumElems = VT.getVectorNumElements();
4419  SDValue ImmN;
4420
4421  IsUnary = false;
4422  switch(N->getOpcode()) {
4423  case X86ISD::SHUFP:
4424    ImmN = N->getOperand(N->getNumOperands()-1);
4425    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4426    break;
4427  case X86ISD::UNPCKH:
4428    DecodeUNPCKHMask(VT, Mask);
4429    break;
4430  case X86ISD::UNPCKL:
4431    DecodeUNPCKLMask(VT, Mask);
4432    break;
4433  case X86ISD::MOVHLPS:
4434    DecodeMOVHLPSMask(NumElems, Mask);
4435    break;
4436  case X86ISD::MOVLHPS:
4437    DecodeMOVLHPSMask(NumElems, Mask);
4438    break;
4439  case X86ISD::PSHUFD:
4440  case X86ISD::VPERMILP:
4441    ImmN = N->getOperand(N->getNumOperands()-1);
4442    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4443    IsUnary = true;
4444    break;
4445  case X86ISD::PSHUFHW:
4446    ImmN = N->getOperand(N->getNumOperands()-1);
4447    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4448    IsUnary = true;
4449    break;
4450  case X86ISD::PSHUFLW:
4451    ImmN = N->getOperand(N->getNumOperands()-1);
4452    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4453    IsUnary = true;
4454    break;
4455  case X86ISD::MOVSS:
4456  case X86ISD::MOVSD: {
4457    // Index 0 always comes from the first element of the second source;
4458    // this is why MOVSS and MOVSD are used in the first place. The other
4459    // elements come from the corresponding positions of the first source vector.
4460    Mask.push_back(NumElems);
4461    for (unsigned i = 1; i != NumElems; ++i) {
4462      Mask.push_back(i);
4463    }
4464    break;
4465  }
4466  case X86ISD::VPERM2X128:
4467    ImmN = N->getOperand(N->getNumOperands()-1);
4468    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4469    if (Mask.empty()) return false;
4470    break;
4471  case X86ISD::MOVDDUP:
4472  case X86ISD::MOVLHPD:
4473  case X86ISD::MOVLPD:
4474  case X86ISD::MOVLPS:
4475  case X86ISD::MOVSHDUP:
4476  case X86ISD::MOVSLDUP:
4477  case X86ISD::PALIGN:
4478    // Not yet implemented
4479    return false;
4480  default: llvm_unreachable("unknown target shuffle node");
4481  }
4482
4483  return true;
4484}
4485
4486/// getShuffleScalarElt - Returns the scalar element that will make up the ith
4487/// element of the result of the vector shuffle.
4488static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
4489                                   unsigned Depth) {
4490  if (Depth == 6)
4491    return SDValue();  // Limit search depth.
4492
4493  SDValue V = SDValue(N, 0);
4494  EVT VT = V.getValueType();
4495  unsigned Opcode = V.getOpcode();
4496
4497  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
4498  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
4499    int Elt = SV->getMaskElt(Index);
4500
4501    if (Elt < 0)
4502      return DAG.getUNDEF(VT.getVectorElementType());
4503
4504    unsigned NumElems = VT.getVectorNumElements();
4505    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
4506                                         : SV->getOperand(1);
4507    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
4508  }
4509
4510  // Recurse into target specific vector shuffles to find scalars.
4511  if (isTargetShuffle(Opcode)) {
4512    unsigned NumElems = VT.getVectorNumElements();
4513    SmallVector<int, 16> ShuffleMask;
4514    SDValue ImmN;
4515    bool IsUnary;
4516
4517    if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary))
4518      return SDValue();
4519
4520    int Elt = ShuffleMask[Index];
4521    if (Elt < 0)
4522      return DAG.getUNDEF(VT.getVectorElementType());
4523
4524    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
4525                                           : N->getOperand(1);
4526    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
4527                               Depth+1);
4528  }
4529
4530  // Actual nodes that may contain scalar elements
4531  if (Opcode == ISD::BITCAST) {
4532    V = V.getOperand(0);
4533    EVT SrcVT = V.getValueType();
4534    unsigned NumElems = VT.getVectorNumElements();
4535
4536    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
4537      return SDValue();
4538  }
4539
4540  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
4541    return (Index == 0) ? V.getOperand(0)
4542                        : DAG.getUNDEF(VT.getVectorElementType());
4543
4544  if (V.getOpcode() == ISD::BUILD_VECTOR)
4545    return V.getOperand(Index);
4546
4547  return SDValue();
4548}
4549
4550/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
4551/// vector shuffle operation which are known to be zero. The search can start
4552/// in two different directions, from the left or from the right.
4553static
4554unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
4555                                  bool ZerosFromLeft, SelectionDAG &DAG) {
4556  unsigned i;
4557  for (i = 0; i != NumElems; ++i) {
4558    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
4559    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
4560    if (!(Elt.getNode() &&
4561         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
4562      break;
4563  }
4564
4565  return i;
4566}
4567
4568/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
4569/// correspond consecutively to elements from one of the vector operands,
4570/// starting from its index OpIdx. Also sets OpNum to the matching source operand.
4571static
4572bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
4573                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
4574                              unsigned NumElems, unsigned &OpNum) {
4575  bool SeenV1 = false;
4576  bool SeenV2 = false;
4577
4578  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
4579    int Idx = SVOp->getMaskElt(i);
4580    // Ignore undef indices
4581    if (Idx < 0)
4582      continue;
4583
4584    if (Idx < (int)NumElems)
4585      SeenV1 = true;
4586    else
4587      SeenV2 = true;
4588
4589    // Only accept consecutive elements from the same vector
4590    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
4591      return false;
4592  }
4593
4594  OpNum = SeenV1 ? 0 : 1;
4595  return true;
4596}
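
// For instance (a sketch): on a v4i32 shuffle with mask <u, u, 2, 3>, calling
// isShuffleMaskConsecutive with MaskI = 2, MaskE = 4 and OpIdx = 2 succeeds
// with OpNum == 0, since indices 2 and 3 come consecutively from V1 starting
// at 2.  A mask such as <u, u, 2, 7> fails because it mixes both sources.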
4597
4598/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
4599/// logical right shift of a vector.
4600static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4601                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4602  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4603  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
4604              false /* check zeros from right */, DAG);
4605  unsigned OpSrc;
4606
4607  if (!NumZeros)
4608    return false;
4609
4610  // Considering the elements in the mask that are not consecutive zeros,
4611  // check if they consecutively come from only one of the source vectors.
4612  //
4613  //               V1 = {X, A, B, C}     0
4614  //                         \  \  \    /
4615  //   vector_shuffle V1, V2 <1, 2, 3, X>
4616  //
4617  if (!isShuffleMaskConsecutive(SVOp,
4618            0,                   // Mask Start Index
4619            NumElems-NumZeros,   // Mask End Index(exclusive)
4620            NumZeros,            // Where to start looking in the src vector
4621            NumElems,            // Number of elements in vector
4622            OpSrc))              // Which source operand ?
4623    return false;
4624
4625  isLeft = false;
4626  ShAmt = NumZeros;
4627  ShVal = SVOp->getOperand(OpSrc);
4628  return true;
4629}
4630
4631/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
4632/// logical left shift of a vector.
4633static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4634                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4635  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4636  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
4637              true /* check zeros from left */, DAG);
4638  unsigned OpSrc;
4639
4640  if (!NumZeros)
4641    return false;
4642
4643  // Considering the elements in the mask that are not consecutive zeros,
4644  // check if they consecutively come from only one of the source vectors.
4645  //
4646  //                           0    { A, B, X, X } = V2
4647  //                          / \    /  /
4648  //   vector_shuffle V1, V2 <X, X, 4, 5>
4649  //
4650  if (!isShuffleMaskConsecutive(SVOp,
4651            NumZeros,     // Mask Start Index
4652            NumElems,     // Mask End Index(exclusive)
4653            0,            // Where to start looking in the src vector
4654            NumElems,     // Number of elements in vector
4655            OpSrc))       // Which source operand ?
4656    return false;
4657
4658  isLeft = true;
4659  ShAmt = NumZeros;
4660  ShVal = SVOp->getOperand(OpSrc);
4661  return true;
4662}
4663
4664/// isVectorShift - Returns true if the shuffle can be implemented as a
4665/// logical left or right shift of a vector.
4666static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4667                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4668  // Although the logic below supports any bitwidth, there are no
4669  // shift instructions which handle more than 128-bit vectors.
4670  if (SVOp->getValueType(0).getSizeInBits() > 128)
4671    return false;
4672
4673  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
4674      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
4675    return true;
4676
4677  return false;
4678}
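
// Example (assuming V2 is an all-zeros vector): a v4i32 shuffle of (V1, V2)
// with mask <1, 2, 3, 4> has one trailing zero element, and elements 1..3
// come consecutively from V1, so isVectorShiftRight reports isLeft = false,
// ShVal = V1 and ShAmt = 1 (measured in vector elements).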
4679
4680/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
4681///
4682static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
4683                                       unsigned NumNonZero, unsigned NumZero,
4684                                       SelectionDAG &DAG,
4685                                       const X86Subtarget* Subtarget,
4686                                       const TargetLowering &TLI) {
4687  if (NumNonZero > 8)
4688    return SDValue();
4689
4690  DebugLoc dl = Op.getDebugLoc();
4691  SDValue V(0, 0);
4692  bool First = true;
4693  for (unsigned i = 0; i < 16; ++i) {
4694    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
4695    if (ThisIsNonZero && First) {
4696      if (NumZero)
4697        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4698      else
4699        V = DAG.getUNDEF(MVT::v8i16);
4700      First = false;
4701    }
4702
4703    if ((i & 1) != 0) {
4704      SDValue ThisElt(0, 0), LastElt(0, 0);
4705      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
4706      if (LastIsNonZero) {
4707        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
4708                              MVT::i16, Op.getOperand(i-1));
4709      }
4710      if (ThisIsNonZero) {
4711        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
4712        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
4713                              ThisElt, DAG.getConstant(8, MVT::i8));
4714        if (LastIsNonZero)
4715          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
4716      } else
4717        ThisElt = LastElt;
4718
4719      if (ThisElt.getNode())
4720        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
4721                        DAG.getIntPtrConstant(i/2));
4722    }
4723  }
4724
4725  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
4726}
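
// The loop above conceptually builds one i16 per byte pair,
//   Elt16 = (zext i16 Op[2*k+1] << 8) | zext i16 Op[2*k]
// and inserts it at position k of a v8i16, which is finally bitcast back to
// v16i8.  Byte positions that are zero or undef are skipped, so the zero or
// undef vector chosen at the top supplies them.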
4727
4728/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
4729///
4730static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
4731                                     unsigned NumNonZero, unsigned NumZero,
4732                                     SelectionDAG &DAG,
4733                                     const X86Subtarget* Subtarget,
4734                                     const TargetLowering &TLI) {
4735  if (NumNonZero > 4)
4736    return SDValue();
4737
4738  DebugLoc dl = Op.getDebugLoc();
4739  SDValue V(0, 0);
4740  bool First = true;
4741  for (unsigned i = 0; i < 8; ++i) {
4742    bool isNonZero = (NonZeros & (1 << i)) != 0;
4743    if (isNonZero) {
4744      if (First) {
4745        if (NumZero)
4746          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4747        else
4748          V = DAG.getUNDEF(MVT::v8i16);
4749        First = false;
4750      }
4751      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
4752                      MVT::v8i16, V, Op.getOperand(i),
4753                      DAG.getIntPtrConstant(i));
4754    }
4755  }
4756
4757  return V;
4758}
4759
4760/// getVShift - Return a vector logical shift node.
4761///
4762static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
4763                         unsigned NumBits, SelectionDAG &DAG,
4764                         const TargetLowering &TLI, DebugLoc dl) {
4765  assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
4766  EVT ShVT = MVT::v2i64;
4767  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
4768  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
4769  return DAG.getNode(ISD::BITCAST, dl, VT,
4770                     DAG.getNode(Opc, dl, ShVT, SrcOp,
4771                             DAG.getConstant(NumBits,
4772                                  TLI.getShiftAmountTy(SrcOp.getValueType()))));
4773}
4774
4775SDValue
4776X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
4777                                          SelectionDAG &DAG) const {
4778
4779  // Check if the scalar load can be widened into a vector load. And if
4780  // the address is "base + cst" see if the cst can be "absorbed" into
4781  // the shuffle mask.
4782  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
4783    SDValue Ptr = LD->getBasePtr();
4784    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
4785      return SDValue();
4786    EVT PVT = LD->getValueType(0);
4787    if (PVT != MVT::i32 && PVT != MVT::f32)
4788      return SDValue();
4789
4790    int FI = -1;
4791    int64_t Offset = 0;
4792    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
4793      FI = FINode->getIndex();
4794      Offset = 0;
4795    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
4796               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
4797      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
4798      Offset = Ptr.getConstantOperandVal(1);
4799      Ptr = Ptr.getOperand(0);
4800    } else {
4801      return SDValue();
4802    }
4803
4804    // FIXME: 256-bit vector instructions don't require a strict alignment,
4805    // improve this code to support it better.
4806    unsigned RequiredAlign = VT.getSizeInBits()/8;
4807    SDValue Chain = LD->getChain();
4808    // Make sure the stack object alignment is at least 16 or 32.
4809    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
4810    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
4811      if (MFI->isFixedObjectIndex(FI)) {
4812        // Can't change the alignment. FIXME: It's possible to compute
4813        // the exact stack offset and reference FI + adjust offset instead.
4814        // If someone *really* cares about this. That's the way to implement it.
4815        return SDValue();
4816      } else {
4817        MFI->setObjectAlignment(FI, RequiredAlign);
4818      }
4819    }
4820
4821    // (Offset % 16 or 32) must be a multiple of 4. The address is then
4822    // Ptr + (Offset & ~(RequiredAlign-1)).
4823    if (Offset < 0)
4824      return SDValue();
4825    if ((Offset % RequiredAlign) & 3)
4826      return SDValue();
4827    int64_t StartOffset = Offset & ~(RequiredAlign-1);
4828    if (StartOffset)
4829      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
4830                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
4831
4832    int EltNo = (Offset - StartOffset) >> 2;
4833    unsigned NumElems = VT.getVectorNumElements();
4834
4835    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
4836    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
4837                             LD->getPointerInfo().getWithOffset(StartOffset),
4838                             false, false, false, 0);
4839
4840    SmallVector<int, 8> Mask;
4841    for (unsigned i = 0; i != NumElems; ++i)
4842      Mask.push_back(EltNo);
4843
4844    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
4845  }
4846
4847  return SDValue();
4848}
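
// Illustrative use of the widening above (hypothetical values): a scalar f32
// load from stack slot FI + 8 is widened to a v4f32 load of the (re)aligned
// 16-byte slot, and the splat becomes a shuffle with mask <2, 2, 2, 2>, since
// EltNo = (8 - 0) >> 2 = 2.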
4849
4850/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
4851/// vector of type 'VT', see if the elements can be replaced by a single large
4852/// load which has the same value as a build_vector whose operands are 'elts'.
4853///
4854/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
4855///
4856/// FIXME: we'd also like to handle the case where the last elements are zero
4857/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
4858/// There's even a handy isZeroNode for that purpose.
4859static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
4860                                        DebugLoc &DL, SelectionDAG &DAG) {
4861  EVT EltVT = VT.getVectorElementType();
4862  unsigned NumElems = Elts.size();
4863
4864  LoadSDNode *LDBase = NULL;
4865  unsigned LastLoadedElt = -1U;
4866
4867  // For each element in the initializer, see if we've found a load or an undef.
4868  // If we don't find an initial load element, or later load elements are
4869  // non-consecutive, bail out.
4870  for (unsigned i = 0; i < NumElems; ++i) {
4871    SDValue Elt = Elts[i];
4872
4873    if (!Elt.getNode() ||
4874        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
4875      return SDValue();
4876    if (!LDBase) {
4877      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
4878        return SDValue();
4879      LDBase = cast<LoadSDNode>(Elt.getNode());
4880      LastLoadedElt = i;
4881      continue;
4882    }
4883    if (Elt.getOpcode() == ISD::UNDEF)
4884      continue;
4885
4886    LoadSDNode *LD = cast<LoadSDNode>(Elt);
4887    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
4888      return SDValue();
4889    LastLoadedElt = i;
4890  }
4891
4892  // If we have found an entire vector of loads and undefs, then return a large
4893  // load of the entire vector width starting at the base pointer.  If we found
4894  // consecutive loads for the low half, generate a vzext_load node.
4895  if (LastLoadedElt == NumElems - 1) {
4896    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
4897      return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
4898                         LDBase->getPointerInfo(),
4899                         LDBase->isVolatile(), LDBase->isNonTemporal(),
4900                         LDBase->isInvariant(), 0);
4901    return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
4902                       LDBase->getPointerInfo(),
4903                       LDBase->isVolatile(), LDBase->isNonTemporal(),
4904                       LDBase->isInvariant(), LDBase->getAlignment());
4905  }
4906  if (NumElems == 4 && LastLoadedElt == 1 &&
4907      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
4908    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
4909    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
4910    SDValue ResNode =
4911        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
4912                                LDBase->getPointerInfo(),
4913                                LDBase->getAlignment(),
4914                                false/*isVolatile*/, true/*ReadMem*/,
4915                                false/*WriteMem*/);
4916    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
4917  }
4918  return SDValue();
4919}
4920
4921/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
4922/// to generate a splat value for the following cases:
4923/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
4924/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
4925/// a scalar load, or a constant.
4926/// The VBROADCAST node is returned when a pattern is found,
4927/// or SDValue() otherwise.
4928SDValue
4929X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
4930  if (!Subtarget->hasAVX())
4931    return SDValue();
4932
4933  EVT VT = Op.getValueType();
4934  DebugLoc dl = Op.getDebugLoc();
4935
4936  assert((VT.is128BitVector() || VT.is256BitVector()) &&
4937         "Unsupported vector type for broadcast.");
4938
4939  SDValue Ld;
4940  bool ConstSplatVal;
4941
4942  switch (Op.getOpcode()) {
4943    default:
4944      // Unknown pattern found.
4945      return SDValue();
4946
4947    case ISD::BUILD_VECTOR: {
4948      // The BUILD_VECTOR node must be a splat.
4949      if (!isSplatVector(Op.getNode()))
4950        return SDValue();
4951
4952      Ld = Op.getOperand(0);
4953      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
4954                     Ld.getOpcode() == ISD::ConstantFP);
4955
4956      // The suspected load node has several users. Make sure that all
4957      // of its users are from the BUILD_VECTOR node.
4958      // Constants may have multiple users.
4959      if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
4960        return SDValue();
4961      break;
4962    }
4963
4964    case ISD::VECTOR_SHUFFLE: {
4965      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
4966
4967      // Shuffles must have a splat mask where the first element is
4968      // broadcasted.
4969      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
4970        return SDValue();
4971
4972      SDValue Sc = Op.getOperand(0);
4973      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR)
4974        return SDValue();
4975
4976      Ld = Sc.getOperand(0);
4977      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
4978                       Ld.getOpcode() == ISD::ConstantFP);
4979
4980      // The scalar_to_vector node and the suspected
4981      // load node must have exactly one user.
4982      // Constants may have multiple users.
4983      if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
4984        return SDValue();
4985      break;
4986    }
4987  }
4988
4989  bool Is256 = VT.getSizeInBits() == 256;
4990
4991  // Handle broadcasting a single constant scalar from the constant pool
4992  // into a vector. On Sandybridge it is still better to load a constant vector
4993  // from the constant pool and not to broadcast it from a scalar.
4994  if (ConstSplatVal && Subtarget->hasAVX2()) {
4995    EVT CVT = Ld.getValueType();
4996    assert(!CVT.isVector() && "Must not broadcast a vector type");
4997    unsigned ScalarSize = CVT.getSizeInBits();
4998
4999    if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
5000      const Constant *C = 0;
5001      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5002        C = CI->getConstantIntValue();
5003      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5004        C = CF->getConstantFPValue();
5005
5006      assert(C && "Invalid constant type");
5007
5008      SDValue CP = DAG.getConstantPool(C, getPointerTy());
5009      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5010      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
5011                       MachinePointerInfo::getConstantPool(),
5012                       false, false, false, Alignment);
5013
5014      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5015    }
5016  }
5017
5018  // The scalar source must be a normal load.
5019  if (!ISD::isNormalLoad(Ld.getNode()))
5020    return SDValue();
5021
5022  // Reject loads that have uses of the chain result
5023  if (Ld->hasAnyUseOfValue(1))
5024    return SDValue();
5025
5026  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5027
5028  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
5029    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5030
5031  // The integer check is needed for the 64-bit-into-128-bit case, so this
5032  // doesn't match double, since there is no vbroadcastsd for xmm registers.
5033  if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
5034    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5035      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5036  }
5037
5038  // Unsupported broadcast.
5039  return SDValue();
5040}
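
// Roughly, the two shapes this routine looks for, written as IR sketches:
//   %s = load float* %p
//   %v = insertelement <8 x float> undef, float %s, i32 0
//   %b = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> zeroinitializer
// (the insertelement at index 0 typically becomes the SCALAR_TO_VECTOR node
// checked above) turns into a single VBROADCAST of the load, and a splat
// BUILD_VECTOR of a 32-bit constant can likewise be broadcast from the
// constant pool when AVX2 is available.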
5041
5042SDValue
5043X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5044  DebugLoc dl = Op.getDebugLoc();
5045
5046  EVT VT = Op.getValueType();
5047  EVT ExtVT = VT.getVectorElementType();
5048  unsigned NumElems = Op.getNumOperands();
5049
5050  // Vectors containing all zeros can be matched by pxor and xorps later
5051  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5052    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
5053    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
5054    if (VT == MVT::v4i32 || VT == MVT::v8i32)
5055      return Op;
5056
5057    return getZeroVector(VT, Subtarget, DAG, dl);
5058  }
5059
5060  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
5061  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
5062  // vpcmpeqd on 256-bit vectors.
5063  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
5064    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2()))
5065      return Op;
5066
5067    return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl);
5068  }
5069
5070  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
5071  if (Broadcast.getNode())
5072    return Broadcast;
5073
5074  unsigned EVTBits = ExtVT.getSizeInBits();
5075
5076  unsigned NumZero  = 0;
5077  unsigned NumNonZero = 0;
5078  unsigned NonZeros = 0;
5079  bool IsAllConstants = true;
5080  SmallSet<SDValue, 8> Values;
5081  for (unsigned i = 0; i < NumElems; ++i) {
5082    SDValue Elt = Op.getOperand(i);
5083    if (Elt.getOpcode() == ISD::UNDEF)
5084      continue;
5085    Values.insert(Elt);
5086    if (Elt.getOpcode() != ISD::Constant &&
5087        Elt.getOpcode() != ISD::ConstantFP)
5088      IsAllConstants = false;
5089    if (X86::isZeroNode(Elt))
5090      NumZero++;
5091    else {
5092      NonZeros |= (1 << i);
5093      NumNonZero++;
5094    }
5095  }
5096
5097  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
5098  if (NumNonZero == 0)
5099    return DAG.getUNDEF(VT);
5100
5101  // Special case for single non-zero, non-undef, element.
5102  if (NumNonZero == 1) {
5103    unsigned Idx = CountTrailingZeros_32(NonZeros);
5104    SDValue Item = Op.getOperand(Idx);
5105
5106    // If this is an insertion of an i64 value on x86-32, and if the top bits of
5107    // the value are obviously zero, truncate the value to i32 and do the
5108    // insertion that way.  Only do this if the value is non-constant or if the
5109    // value is a constant being inserted into element 0.  It is cheaper to do
5110    // a constant pool load than it is to do a movd + shuffle.
5111    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
5112        (!IsAllConstants || Idx == 0)) {
5113      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
5114        // Handle SSE only.
5115        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
5116        EVT VecVT = MVT::v4i32;
5117        unsigned VecElts = 4;
5118
5119        // Truncate the value (which may itself be a constant) to i32, and
5120        // convert it to a vector with movd (S2V+shuffle to zero extend).
5121        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
5122        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
5123        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5124
5125        // Now we have our 32-bit value zero extended in the low element of
5126        // a vector.  If Idx != 0, swizzle it into place.
5127        if (Idx != 0) {
5128          SmallVector<int, 4> Mask;
5129          Mask.push_back(Idx);
5130          for (unsigned i = 1; i != VecElts; ++i)
5131            Mask.push_back(i);
5132          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
5133                                      &Mask[0]);
5134        }
5135        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5136      }
5137    }
5138
5139    // If we have a constant or non-constant insertion into the low element of
5140    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
5141    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
5142    // depending on what the source datatype is.
5143    if (Idx == 0) {
5144      if (NumZero == 0)
5145        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5146
5147      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
5148          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
5149        if (VT.getSizeInBits() == 256) {
5150          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
5151          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
5152                             Item, DAG.getIntPtrConstant(0));
5153        }
5154        assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
5155        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5156        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
5157        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5158      }
5159
5160      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
5161        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
5162        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
5163        if (VT.getSizeInBits() == 256) {
5164          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
5165          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
5166        } else {
5167          assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
5168          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5169        }
5170        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5171      }
5172    }
5173
5174    // Is it a vector logical left shift?
5175    if (NumElems == 2 && Idx == 1 &&
5176        X86::isZeroNode(Op.getOperand(0)) &&
5177        !X86::isZeroNode(Op.getOperand(1))) {
5178      unsigned NumBits = VT.getSizeInBits();
5179      return getVShift(true, VT,
5180                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5181                                   VT, Op.getOperand(1)),
5182                       NumBits/2, DAG, *this, dl);
5183    }
5184
5185    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
5186      return SDValue();
5187
5188    // Otherwise, if this is a vector with i32 or f32 elements, and the element
5189    // is a non-constant being inserted into an element other than the low one,
5190    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
5191    // movd/movss) to move this into the low element, then shuffle it into
5192    // place.
5193    if (EVTBits == 32) {
5194      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5195
5196      // Turn it into a shuffle of zero and zero-extended scalar to vector.
5197      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
5198      SmallVector<int, 8> MaskVec;
5199      for (unsigned i = 0; i < NumElems; i++)
5200        MaskVec.push_back(i == Idx ? 0 : 1);
5201      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
5202    }
5203  }
5204
5205  // Splat is obviously ok. Let legalizer expand it to a shuffle.
5206  if (Values.size() == 1) {
5207    if (EVTBits == 32) {
5208      // Instead of a shuffle like this:
5209      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
5210      // Check if it's possible to issue this instead.
5211      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
5212      unsigned Idx = CountTrailingZeros_32(NonZeros);
5213      SDValue Item = Op.getOperand(Idx);
5214      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
5215        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
5216    }
5217    return SDValue();
5218  }
5219
5220  // A vector full of immediates; various special cases are already
5221  // handled, so this is best done with a single constant-pool load.
5222  if (IsAllConstants)
5223    return SDValue();
5224
5225  // For AVX-length vectors, build the individual 128-bit pieces and use
5226  // shuffles to put them in place.
5227  if (VT.getSizeInBits() == 256) {
5228    SmallVector<SDValue, 32> V;
5229    for (unsigned i = 0; i != NumElems; ++i)
5230      V.push_back(Op.getOperand(i));
5231
5232    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
5233
5234    // Build both the lower and upper subvector.
5235    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
5236    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
5237                                NumElems/2);
5238
5239    // Recreate the wider vector with the lower and upper part.
5240    return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
5241  }
5242
5243  // Let legalizer expand 2-wide build_vectors.
5244  if (EVTBits == 64) {
5245    if (NumNonZero == 1) {
5246      // One half is zero or undef.
5247      unsigned Idx = CountTrailingZeros_32(NonZeros);
5248      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
5249                                 Op.getOperand(Idx));
5250      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
5251    }
5252    return SDValue();
5253  }
5254
5255  // If element VT is < 32 bits, convert it to inserts into a zero vector.
5256  if (EVTBits == 8 && NumElems == 16) {
5257    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
5258                                        Subtarget, *this);
5259    if (V.getNode()) return V;
5260  }
5261
5262  if (EVTBits == 16 && NumElems == 8) {
5263    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
5264                                      Subtarget, *this);
5265    if (V.getNode()) return V;
5266  }
5267
5268  // If element VT is == 32 bits, turn it into a number of shuffles.
5269  SmallVector<SDValue, 8> V(NumElems);
5270  if (NumElems == 4 && NumZero > 0) {
5271    for (unsigned i = 0; i < 4; ++i) {
5272      bool isZero = !(NonZeros & (1 << i));
5273      if (isZero)
5274        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
5275      else
5276        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5277    }
5278
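    // Combine the four single-element vectors in pairs: when both elements of
    // a pair are zero the zero vector is kept, when exactly one is nonzero the
    // pair is merged with a movl-style shuffle, and when both are nonzero they
    // are interleaved with an unpckl-style shuffle.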
5279    for (unsigned i = 0; i < 2; ++i) {
5280      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
5281        default: break;
5282        case 0:
5283          V[i] = V[i*2];  // Must be a zero vector.
5284          break;
5285        case 1:
5286          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
5287          break;
5288        case 2:
5289          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
5290          break;
5291        case 3:
5292          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
5293          break;
5294      }
5295    }
5296
5297    bool Reverse1 = (NonZeros & 0x3) == 2;
5298    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
5299    int MaskVec[] = {
5300      Reverse1 ? 1 : 0,
5301      Reverse1 ? 0 : 1,
5302      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
5303      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
5304    };
5305    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
5306  }
5307
5308  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
5309    // Check for a build vector of consecutive loads.
5310    for (unsigned i = 0; i < NumElems; ++i)
5311      V[i] = Op.getOperand(i);
5312
5313    // Check for elements which are consecutive loads.
5314    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
5315    if (LD.getNode())
5316      return LD;
5317
5318    // For SSE 4.1, use insertps to insert each element into its place.
5319    if (getSubtarget()->hasSSE41()) {
5320      SDValue Result;
5321      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
5322        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
5323      else
5324        Result = DAG.getUNDEF(VT);
5325
5326      for (unsigned i = 1; i < NumElems; ++i) {
5327        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
5328        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
5329                             Op.getOperand(i), DAG.getIntPtrConstant(i));
5330      }
5331      return Result;
5332    }
5333
5334    // Otherwise, expand into a number of unpckl*; start by extending each of
5335    // our (non-undef) elements to the full vector width with the element in the
5336    // bottom slot of the vector (which generates no code for SSE).
5337    for (unsigned i = 0; i < NumElems; ++i) {
5338      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
5339        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5340      else
5341        V[i] = DAG.getUNDEF(VT);
5342    }
5343
5344    // Next, we iteratively mix elements, e.g. for v4f32:
5345    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
5346    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
5347    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
5348    unsigned EltStride = NumElems >> 1;
5349    while (EltStride != 0) {
5350      for (unsigned i = 0; i < EltStride; ++i) {
5351        // If V[i+EltStride] is undef and this is the first round of mixing,
5352        // then it is safe to just drop this shuffle: V[i] is already in the
5353        // right place, the one element (since it's the first round) being
5354        // inserted as undef can be dropped.  This isn't safe for successive
5355        // rounds because they will permute elements within both vectors.
5356        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
5357            EltStride == NumElems/2)
5358          continue;
5359
5360        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
5361      }
5362      EltStride >>= 1;
5363    }
5364    return V[0];
5365  }
5366  return SDValue();
5367}
5368
5369// LowerMMXCONCAT_VECTORS - We support concatenating two MMX registers and
5370// placing them in an MMX register.  This is better than converting through the stack.
5371static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5372  DebugLoc dl = Op.getDebugLoc();
5373  EVT ResVT = Op.getValueType();
5374
5375  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
5376         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
5377  int Mask[2];
5378  SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
5379  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
5380  InVec = Op.getOperand(1);
5381  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
5382    unsigned NumElts = ResVT.getVectorNumElements();
5383    VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
5384    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
5385                       InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
5386  } else {
5387    InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
5388    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
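    // Combine the two converted operands by taking the low quadword of each:
    // element 0 of VecOp and element 0 of VecOp2 (index 2 in the two-operand
    // shuffle).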
5389    Mask[0] = 0; Mask[1] = 2;
5390    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
5391  }
5392  return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
5393}
5394
5395// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
5396// to create 256-bit vectors from two other 128-bit ones.
5397static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5398  DebugLoc dl = Op.getDebugLoc();
5399  EVT ResVT = Op.getValueType();
5400
5401  assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
5402
5403  SDValue V1 = Op.getOperand(0);
5404  SDValue V2 = Op.getOperand(1);
5405  unsigned NumElems = ResVT.getVectorNumElements();
5406
5407  return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
5408}
5409
5410SDValue
5411X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
5412  EVT ResVT = Op.getValueType();
5413
5414  assert(Op.getNumOperands() == 2);
5415  assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
5416         "Unsupported CONCAT_VECTORS for value type");
5417
5418  // We support concatenating two MMX registers and placing them in an MMX
5419  // register.  This is better than converting through the stack.
5420  if (ResVT.is128BitVector())
5421    return LowerMMXCONCAT_VECTORS(Op, DAG);
5422
5423  // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
5424  // from two other 128-bit ones.
5425  return LowerAVXCONCAT_VECTORS(Op, DAG);
5426}
5427
5428// Try to lower a shuffle node into a simple blend instruction.
5429static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
5430                                          const X86Subtarget *Subtarget,
5431                                          SelectionDAG &DAG) {
5432  SDValue V1 = SVOp->getOperand(0);
5433  SDValue V2 = SVOp->getOperand(1);
5434  DebugLoc dl = SVOp->getDebugLoc();
5435  MVT VT = SVOp->getValueType(0).getSimpleVT();
5436  unsigned NumElems = VT.getVectorNumElements();
5437
5438  if (!Subtarget->hasSSE41())
5439    return SDValue();
5440
5441  unsigned ISDNo = 0;
5442  MVT OpTy;
5443
5444  switch (VT.SimpleTy) {
5445  default: return SDValue();
5446  case MVT::v8i16:
5447    ISDNo = X86ISD::BLENDPW;
5448    OpTy = MVT::v8i16;
5449    break;
5450  case MVT::v4i32:
5451  case MVT::v4f32:
5452    ISDNo = X86ISD::BLENDPS;
5453    OpTy = MVT::v4f32;
5454    break;
5455  case MVT::v2i64:
5456  case MVT::v2f64:
5457    ISDNo = X86ISD::BLENDPD;
5458    OpTy = MVT::v2f64;
5459    break;
5460  case MVT::v8i32:
5461  case MVT::v8f32:
5462    if (!Subtarget->hasAVX())
5463      return SDValue();
5464    ISDNo = X86ISD::BLENDPS;
5465    OpTy = MVT::v8f32;
5466    break;
5467  case MVT::v4i64:
5468  case MVT::v4f64:
5469    if (!Subtarget->hasAVX())
5470      return SDValue();
5471    ISDNo = X86ISD::BLENDPD;
5472    OpTy = MVT::v4f64;
5473    break;
5474  }
5475  assert(ISDNo && "Invalid Op Number");
5476
5477  unsigned MaskVals = 0;
5478
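  // Build the blend immediate: bit i is set when result element i is taken
  // from V1 (or is undef) and left clear when it is taken from V2.  For
  // example (illustrative), a v4f32 mask <0, 5, 2, 7> takes elements 0 and 2
  // from V1 and elements 1 and 3 from V2, giving MaskVals = 0b0101.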
5479  for (unsigned i = 0; i != NumElems; ++i) {
5480    int EltIdx = SVOp->getMaskElt(i);
5481    if (EltIdx == (int)i || EltIdx < 0)
5482      MaskVals |= (1<<i);
5483    else if (EltIdx == (int)(i + NumElems))
5484      continue; // Bit is set to zero.
5485    else
5486      return SDValue();
5487  }
5488
5489  V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
5490  V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2);
5491  SDValue Ret =  DAG.getNode(ISDNo, dl, OpTy, V1, V2,
5492                             DAG.getConstant(MaskVals, MVT::i32));
5493  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
5494}
5495
5496// v8i16 shuffles - Prefer shuffles in the following order:
5497// 1. [all]   pshuflw, pshufhw, optional move
5498// 2. [ssse3] 1 x pshufb
5499// 3. [ssse3] 2 x pshufb + 1 x por
5500// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
5501SDValue
5502X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
5503                                            SelectionDAG &DAG) const {
5504  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5505  SDValue V1 = SVOp->getOperand(0);
5506  SDValue V2 = SVOp->getOperand(1);
5507  DebugLoc dl = SVOp->getDebugLoc();
5508  SmallVector<int, 8> MaskVals;
5509
5510  // Determine if more than 1 of the words in each of the low and high quadwords
5511  // of the result come from the same quadword of one of the two inputs.  Undef
5512  // mask values count as coming from any quadword, for better codegen.
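  // For example (illustrative), the v8i16 mask <0,1,2,3,8,9,10,11> scores
  // quadword 0 (the low half of V1) for every low word and quadword 2 (the
  // low half of V2) for every high word, giving BestLoQuad = 0 and
  // BestHiQuad = 2 below.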
5513  unsigned LoQuad[] = { 0, 0, 0, 0 };
5514  unsigned HiQuad[] = { 0, 0, 0, 0 };
5515  std::bitset<4> InputQuads;
5516  for (unsigned i = 0; i < 8; ++i) {
5517    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
5518    int EltIdx = SVOp->getMaskElt(i);
5519    MaskVals.push_back(EltIdx);
5520    if (EltIdx < 0) {
5521      ++Quad[0];
5522      ++Quad[1];
5523      ++Quad[2];
5524      ++Quad[3];
5525      continue;
5526    }
5527    ++Quad[EltIdx / 4];
5528    InputQuads.set(EltIdx / 4);
5529  }
5530
5531  int BestLoQuad = -1;
5532  unsigned MaxQuad = 1;
5533  for (unsigned i = 0; i < 4; ++i) {
5534    if (LoQuad[i] > MaxQuad) {
5535      BestLoQuad = i;
5536      MaxQuad = LoQuad[i];
5537    }
5538  }
5539
5540  int BestHiQuad = -1;
5541  MaxQuad = 1;
5542  for (unsigned i = 0; i < 4; ++i) {
5543    if (HiQuad[i] > MaxQuad) {
5544      BestHiQuad = i;
5545      MaxQuad = HiQuad[i];
5546    }
5547  }
5548
5549  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
5550  // of the two input vectors, shuffle them into one input vector so only a
5551  // single pshufb instruction is necessary. If there are more than 2 input
5552  // quads, disable the next transformation since it does not help SSSE3.
5553  bool V1Used = InputQuads[0] || InputQuads[1];
5554  bool V2Used = InputQuads[2] || InputQuads[3];
5555  if (Subtarget->hasSSSE3()) {
5556    if (InputQuads.count() == 2 && V1Used && V2Used) {
5557      BestLoQuad = InputQuads[0] ? 0 : 1;
5558      BestHiQuad = InputQuads[2] ? 2 : 3;
5559    }
5560    if (InputQuads.count() > 2) {
5561      BestLoQuad = -1;
5562      BestHiQuad = -1;
5563    }
5564  }
5565
5566  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
5567  // the shuffle mask.  If a quad is scored as -1, that means that it contains
5568  // words from all 4 input quadwords.
5569  SDValue NewV;
5570  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
5571    int MaskV[] = {
5572      BestLoQuad < 0 ? 0 : BestLoQuad,
5573      BestHiQuad < 0 ? 1 : BestHiQuad
5574    };
5575    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
5576                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
5577                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
5578    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
5579
5580    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
5581    // source words for the shuffle, to aid later transformations.
5582    bool AllWordsInNewV = true;
5583    bool InOrder[2] = { true, true };
5584    for (unsigned i = 0; i != 8; ++i) {
5585      int idx = MaskVals[i];
5586      if (idx != (int)i)
5587        InOrder[i/4] = false;
5588      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
5589        continue;
5590      AllWordsInNewV = false;
5591      break;
5592    }
5593
5594    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
5595    if (AllWordsInNewV) {
5596      for (int i = 0; i != 8; ++i) {
5597        int idx = MaskVals[i];
5598        if (idx < 0)
5599          continue;
5600        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
5601        if ((idx != i) && idx < 4)
5602          pshufhw = false;
5603        if ((idx != i) && idx > 3)
5604          pshuflw = false;
5605      }
5606      V1 = NewV;
5607      V2Used = false;
5608      BestLoQuad = 0;
5609      BestHiQuad = 1;
5610    }
5611
5612    // If we've eliminated the use of V2, and the new mask is a pshuflw or
5613    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
5614    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
5615      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
5616      unsigned TargetMask = 0;
5617      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
5618                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
5619      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5620      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
5621                             getShufflePSHUFLWImmediate(SVOp);
5622      V1 = NewV.getOperand(0);
5623      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
5624    }
5625  }
5626
5627  // If we have SSSE3, and all words of the result are from 1 input vector,
5628  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
5629  // is present, fall back to case 4.
5630  if (Subtarget->hasSSSE3()) {
5631    SmallVector<SDValue,16> pshufbMask;
5632
5633    // If we have elements from both input vectors, set the high bit of the
5634    // shuffle mask element to zero out elements that come from V2 in the V1
5635    // mask, and elements that come from V1 in the V2 mask, so that the two
5636    // results can be OR'd together.
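    // For example (illustrative), a mask word value of 3 becomes byte indices
    // 6 and 7 in the pshufb control vector, while 0x80 forces the
    // corresponding result byte to zero.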
5637    bool TwoInputs = V1Used && V2Used;
5638    for (unsigned i = 0; i != 8; ++i) {
5639      int EltIdx = MaskVals[i] * 2;
5640      if (TwoInputs && (EltIdx >= 16)) {
5641        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5642        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5643        continue;
5644      }
5645      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
5646      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
5647    }
5648    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
5649    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5650                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5651                                 MVT::v16i8, &pshufbMask[0], 16));
5652    if (!TwoInputs)
5653      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5654
5655    // Calculate the shuffle mask for the second input, shuffle it, and
5656    // OR it with the first shuffled input.
5657    pshufbMask.clear();
5658    for (unsigned i = 0; i != 8; ++i) {
5659      int EltIdx = MaskVals[i] * 2;
5660      if (EltIdx < 16) {
5661        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5662        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5663        continue;
5664      }
5665      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
5666      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
5667    }
5668    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
5669    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
5670                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5671                                 MVT::v16i8, &pshufbMask[0], 16));
5672    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
5673    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5674  }
5675
5676  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
5677  // and update MaskVals with new element order.
5678  std::bitset<8> InOrder;
5679  if (BestLoQuad >= 0) {
5680    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
5681    for (int i = 0; i != 4; ++i) {
5682      int idx = MaskVals[i];
5683      if (idx < 0) {
5684        InOrder.set(i);
5685      } else if ((idx / 4) == BestLoQuad) {
5686        MaskV[i] = idx & 3;
5687        InOrder.set(i);
5688      }
5689    }
5690    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
5691                                &MaskV[0]);
5692
5693    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
5694      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5695      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
5696                                  NewV.getOperand(0),
5697                                  getShufflePSHUFLWImmediate(SVOp), DAG);
5698    }
5699  }
5700
5701  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
5702  // and update MaskVals with the new element order.
5703  if (BestHiQuad >= 0) {
5704    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
5705    for (unsigned i = 4; i != 8; ++i) {
5706      int idx = MaskVals[i];
5707      if (idx < 0) {
5708        InOrder.set(i);
5709      } else if ((idx / 4) == BestHiQuad) {
5710        MaskV[i] = (idx & 3) + 4;
5711        InOrder.set(i);
5712      }
5713    }
5714    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
5715                                &MaskV[0]);
5716
5717    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
5718      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5719      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
5720                                  NewV.getOperand(0),
5721                                  getShufflePSHUFHWImmediate(SVOp), DAG);
5722    }
5723  }
5724
5725  // In case BestHiQuad & BestLoQuad were both -1, which means each quadword has
5726  // a word from each of the four input quadwords, calculate the InOrder
5727  // bitvector now before falling through to the insert/extract cleanup.
5728  if (BestLoQuad == -1 && BestHiQuad == -1) {
5729    NewV = V1;
5730    for (int i = 0; i != 8; ++i)
5731      if (MaskVals[i] < 0 || MaskVals[i] == i)
5732        InOrder.set(i);
5733  }
5734
5735  // The other elements are put in the right place using pextrw and pinsrw.
5736  for (unsigned i = 0; i != 8; ++i) {
5737    if (InOrder[i])
5738      continue;
5739    int EltIdx = MaskVals[i];
5740    if (EltIdx < 0)
5741      continue;
5742    SDValue ExtOp = (EltIdx < 8) ?
5743      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
5744                  DAG.getIntPtrConstant(EltIdx)) :
5745      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
5746                  DAG.getIntPtrConstant(EltIdx - 8));
5747    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
5748                       DAG.getIntPtrConstant(i));
5749  }
5750  return NewV;
5751}
5752
5753// v16i8 shuffles - Prefer shuffles in the following order:
5754// 1. [ssse3] 1 x pshufb
5755// 2. [ssse3] 2 x pshufb + 1 x por
5756// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
5757static
5758SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
5759                                 SelectionDAG &DAG,
5760                                 const X86TargetLowering &TLI) {
5761  SDValue V1 = SVOp->getOperand(0);
5762  SDValue V2 = SVOp->getOperand(1);
5763  DebugLoc dl = SVOp->getDebugLoc();
5764  ArrayRef<int> MaskVals = SVOp->getMask();
5765
5766  // If we have SSSE3, case 1 is generated when all result bytes come from
5767  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
5768  // present, fall back to case 3.
5769  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
5770  bool V1Only = true;
5771  bool V2Only = true;
5772  for (unsigned i = 0; i < 16; ++i) {
5773    int EltIdx = MaskVals[i];
5774    if (EltIdx < 0)
5775      continue;
5776    if (EltIdx < 16)
5777      V2Only = false;
5778    else
5779      V1Only = false;
5780  }
5781
5782  // If SSSE3, use 1 pshufb instruction per input vector that has elements in the result.
5783  if (TLI.getSubtarget()->hasSSSE3()) {
5784    SmallVector<SDValue,16> pshufbMask;
5785
5786    // If all result elements are from one input vector, then only translate
5787    // undef mask values to 0x80 (zero out result) in the pshufb mask.
5788    //
5789    // Otherwise, we have elements from both input vectors, and must zero out
5790    // elements that come from V2 in the first mask, and V1 in the second mask
5791    // so that we can OR them together.
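    // For example (illustrative), when both inputs are used a mask value of 5
    // selects byte 5 of V1 in the first pshufb (and is zeroed in the second),
    // while a value of 21 is zeroed (0x80) in the first mask and selects
    // byte 5 (21 - 16) of V2 in the second.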
5792    bool TwoInputs = !(V1Only || V2Only);
5793    for (unsigned i = 0; i != 16; ++i) {
5794      int EltIdx = MaskVals[i];
5795      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
5796        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5797        continue;
5798      }
5799      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
5800    }
5801    // If all the elements are from V2, assign it to V1 and return after
5802    // building the first pshufb.
5803    if (V2Only)
5804      V1 = V2;
5805    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5806                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5807                                 MVT::v16i8, &pshufbMask[0], 16));
5808    if (!TwoInputs)
5809      return V1;
5810
5811    // Calculate the shuffle mask for the second input, shuffle it, and
5812    // OR it with the first shuffled input.
5813    pshufbMask.clear();
5814    for (unsigned i = 0; i != 16; ++i) {
5815      int EltIdx = MaskVals[i];
5816      if (EltIdx < 16) {
5817        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5818        continue;
5819      }
5820      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
5821    }
5822    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
5823                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5824                                 MVT::v16i8, &pshufbMask[0], 16));
5825    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
5826  }
5827
5828  // No SSSE3 - Calculate the in-place words and then fix all out-of-place words
5829  // with 0-16 extracts & inserts.  The worst case is 16 bytes out of order from
5830  // the 16 different words that comprise the two doublequadword input vectors.
5831  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5832  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
5833  SDValue NewV = V2Only ? V2 : V1;
5834  for (int i = 0; i != 8; ++i) {
5835    int Elt0 = MaskVals[i*2];
5836    int Elt1 = MaskVals[i*2+1];
5837
5838    // This word of the result is all undef, skip it.
5839    if (Elt0 < 0 && Elt1 < 0)
5840      continue;
5841
5842    // This word of the result is already in the correct place, skip it.
5843    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
5844      continue;
5845    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
5846      continue;
5847
5848    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
5849    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
5850    SDValue InsElt;
5851
5852    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
5853    // together using a single extract, load it and store it.
5854    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
5855      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
5856                           DAG.getIntPtrConstant(Elt1 / 2));
5857      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
5858                        DAG.getIntPtrConstant(i));
5859      continue;
5860    }
5861
5862    // If Elt1 is defined, extract it from the appropriate source.  If the
5863    // source byte is not also odd, shift the extracted word left 8 bits;
5864    // otherwise clear the bottom 8 bits if we need to do an OR.
5865    if (Elt1 >= 0) {
5866      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
5867                           DAG.getIntPtrConstant(Elt1 / 2));
5868      if ((Elt1 & 1) == 0)
5869        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
5870                             DAG.getConstant(8,
5871                                  TLI.getShiftAmountTy(InsElt.getValueType())));
5872      else if (Elt0 >= 0)
5873        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
5874                             DAG.getConstant(0xFF00, MVT::i16));
5875    }
5876    // If Elt0 is defined, extract it from the appropriate source.  If the
5877    // source byte is not also even, shift the extracted word right 8 bits. If
5878    // Elt1 was also defined, OR the extracted values together before
5879    // inserting them in the result.
5880    if (Elt0 >= 0) {
5881      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
5882                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
5883      if ((Elt0 & 1) != 0)
5884        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
5885                              DAG.getConstant(8,
5886                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
5887      else if (Elt1 >= 0)
5888        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
5889                             DAG.getConstant(0x00FF, MVT::i16));
5890      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
5891                         : InsElt0;
5892    }
5893    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
5894                       DAG.getIntPtrConstant(i));
5895  }
5896  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
5897}
5898
5899/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4-wide
5900/// ones, or rewriting v4i32 / v4f32 as 2-wide ones if possible. This can be
5901/// done when every pair / quad of shuffle mask elements points to elements in
5902/// the right sequence, e.g.
5903/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
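/// is rewritten as the v4i32 shuffle
///   vector_shuffle X', Y', <1, 5, 0, 7>
/// where X' and Y' are bitcasts of X and Y to the wider element type.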
5904static
5905SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
5906                                 SelectionDAG &DAG, DebugLoc dl) {
5907  MVT VT = SVOp->getValueType(0).getSimpleVT();
5908  unsigned NumElems = VT.getVectorNumElements();
5909  MVT NewVT;
5910  unsigned Scale;
5911  switch (VT.SimpleTy) {
5912  default: llvm_unreachable("Unexpected!");
5913  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
5914  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
5915  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
5916  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
5917  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
5918  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
5919  }
5920
5921  SmallVector<int, 8> MaskVec;
5922  for (unsigned i = 0; i != NumElems; i += Scale) {
5923    int StartIdx = -1;
5924    for (unsigned j = 0; j != Scale; ++j) {
5925      int EltIdx = SVOp->getMaskElt(i+j);
5926      if (EltIdx < 0)
5927        continue;
5928      if (StartIdx < 0)
5929        StartIdx = (EltIdx / Scale);
5930      if (EltIdx != (int)(StartIdx*Scale + j))
5931        return SDValue();
5932    }
5933    MaskVec.push_back(StartIdx);
5934  }
5935
5936  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
5937  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
5938  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
5939}
5940
5941/// getVZextMovL - Return a zero-extending vector move low node.
5942///
5943static SDValue getVZextMovL(EVT VT, EVT OpVT,
5944                            SDValue SrcOp, SelectionDAG &DAG,
5945                            const X86Subtarget *Subtarget, DebugLoc dl) {
5946  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
5947    LoadSDNode *LD = NULL;
5948    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
5949      LD = dyn_cast<LoadSDNode>(SrcOp);
5950    if (!LD) {
5951      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
5952      // instead.
5953      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
5954      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
5955          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5956          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
5957          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
5958        // PR2108
5959        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
5960        return DAG.getNode(ISD::BITCAST, dl, VT,
5961                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
5962                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5963                                                   OpVT,
5964                                                   SrcOp.getOperand(0)
5965                                                          .getOperand(0))));
5966      }
5967    }
5968  }
5969
5970  return DAG.getNode(ISD::BITCAST, dl, VT,
5971                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
5972                                 DAG.getNode(ISD::BITCAST, dl,
5973                                             OpVT, SrcOp)));
5974}
5975
5976/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
5977/// which could not be matched by any known target specific shuffle.
5978static SDValue
5979LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
5980  EVT VT = SVOp->getValueType(0);
5981
5982  unsigned NumElems = VT.getVectorNumElements();
5983  unsigned NumLaneElems = NumElems / 2;
5984
5985  DebugLoc dl = SVOp->getDebugLoc();
5986  MVT EltVT = VT.getVectorElementType().getSimpleVT();
5987  EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
5988  SDValue Shufs[2];
5989
5990  SmallVector<int, 16> Mask;
5991  for (unsigned l = 0; l < 2; ++l) {
5992    // Build a shuffle mask for the output, discovering on the fly which
5993    // input vectors to use as shuffle operands (recorded in InputUsed).
5994    // If building a suitable shuffle vector proves too hard, then bail
5995    // out and return an empty SDValue.
5996    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
5997    unsigned LaneStart = l * NumLaneElems;
5998    for (unsigned i = 0; i != NumLaneElems; ++i) {
5999      // The mask element.  This indexes into the input.
6000      int Idx = SVOp->getMaskElt(i+LaneStart);
6001      if (Idx < 0) {
6002        // The mask element does not index into any input vector.
6003        Mask.push_back(-1);
6004        continue;
6005      }
6006
6007      // The input vector this mask element indexes into.
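      // (Inputs 0 and 1 are the low and high 128-bit lanes of the first
      // operand; 2 and 3 are the low and high lanes of the second operand.)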
6008      int Input = Idx / NumLaneElems;
6009
6010      // Turn the index into an offset from the start of the input vector.
6011      Idx -= Input * NumLaneElems;
6012
6013      // Find or create a shuffle vector operand to hold this input.
6014      unsigned OpNo;
6015      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
6016        if (InputUsed[OpNo] == Input)
6017          // This input vector is already an operand.
6018          break;
6019        if (InputUsed[OpNo] < 0) {
6020          // Create a new operand for this input vector.
6021          InputUsed[OpNo] = Input;
6022          break;
6023        }
6024      }
6025
6026      if (OpNo >= array_lengthof(InputUsed)) {
6027        // More than two input vectors used! Give up.
6028        return SDValue();
6029      }
6030
6031      // Add the mask index for the new shuffle vector.
6032      Mask.push_back(Idx + OpNo * NumLaneElems);
6033    }
6034
6035    if (InputUsed[0] < 0) {
6036      // No input vectors were used! The result is undefined.
6037      Shufs[l] = DAG.getUNDEF(NVT);
6038    } else {
6039      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
6040                                        (InputUsed[0] % 2) * NumLaneElems,
6041                                        DAG, dl);
6042      // If only one input was used, use an undefined vector for the other.
6043      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
6044        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
6045                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
6046      // At least one input vector was used. Create a new shuffle vector.
6047      Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
6048    }
6049
6050    Mask.clear();
6051  }
6052
6053  // Concatenate the per-lane results back into a single 256-bit vector.
6054  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Shufs[0], Shufs[1]);
6055}
6056
6057/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
6058/// 4 elements, and match them with several different shuffle types.
6059static SDValue
6060LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6061  SDValue V1 = SVOp->getOperand(0);
6062  SDValue V2 = SVOp->getOperand(1);
6063  DebugLoc dl = SVOp->getDebugLoc();
6064  EVT VT = SVOp->getValueType(0);
6065
6066  assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
6067
6068  std::pair<int, int> Locs[4];
6069  int Mask1[] = { -1, -1, -1, -1 };
6070  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
6071
6072  unsigned NumHi = 0;
6073  unsigned NumLo = 0;
6074  for (unsigned i = 0; i != 4; ++i) {
6075    int Idx = PermMask[i];
6076    if (Idx < 0) {
6077      Locs[i] = std::make_pair(-1, -1);
6078    } else {
6079      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
6080      if (Idx < 4) {
6081        Locs[i] = std::make_pair(0, NumLo);
6082        Mask1[NumLo] = Idx;
6083        NumLo++;
6084      } else {
6085        Locs[i] = std::make_pair(1, NumHi);
6086        if (2+NumHi < 4)
6087          Mask1[2+NumHi] = Idx;
6088        NumHi++;
6089      }
6090    }
6091  }
6092
6093  if (NumLo <= 2 && NumHi <= 2) {
6094    // No more than two elements come from either vector. This can be
6095    // implemented with two shuffles. The first shuffle gathers the elements.
6096    // The second shuffle, which takes the first shuffle as both of its
6097    // vector operands, puts the elements into the right order.
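    // For example (illustrative), the mask <4, 1, 6, 3> is lowered by first
    // gathering <1, 3, 4, 6> into one register and then reordering that
    // result with the mask <2, 0, 7, 5>.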
6098    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6099
6100    int Mask2[] = { -1, -1, -1, -1 };
6101
6102    for (unsigned i = 0; i != 4; ++i)
6103      if (Locs[i].first != -1) {
6104        unsigned Idx = (i < 2) ? 0 : 4;
6105        Idx += Locs[i].first * 2 + Locs[i].second;
6106        Mask2[i] = Idx;
6107      }
6108
6109    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
6110  }
6111
6112  if (NumLo == 3 || NumHi == 3) {
6113    // Otherwise, we must have three elements from one vector, call it X, and
6114    // one element from the other, call it Y.  First, use a shufps to build an
6115    // intermediate vector with the one element from Y and the element from X
6116    // that will be in the same half in the final destination (the indexes don't
6117    // matter). Then, use a shufps to build the final vector, taking the half
6118    // containing the element from Y from the intermediate, and the other half
6119    // from X.
6120    if (NumHi == 3) {
6121      // Normalize it so the 3 elements come from V1.
6122      CommuteVectorShuffleMask(PermMask, 4);
6123      std::swap(V1, V2);
6124    }
6125
6126    // Find the element from V2.
6127    unsigned HiIndex;
6128    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
6129      int Val = PermMask[HiIndex];
6130      if (Val < 0)
6131        continue;
6132      if (Val >= 4)
6133        break;
6134    }
6135
6136    Mask1[0] = PermMask[HiIndex];
6137    Mask1[1] = -1;
6138    Mask1[2] = PermMask[HiIndex^1];
6139    Mask1[3] = -1;
6140    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6141
6142    if (HiIndex >= 2) {
6143      Mask1[0] = PermMask[0];
6144      Mask1[1] = PermMask[1];
6145      Mask1[2] = HiIndex & 1 ? 6 : 4;
6146      Mask1[3] = HiIndex & 1 ? 4 : 6;
6147      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6148    }
6149
6150    Mask1[0] = HiIndex & 1 ? 2 : 0;
6151    Mask1[1] = HiIndex & 1 ? 0 : 2;
6152    Mask1[2] = PermMask[2];
6153    Mask1[3] = PermMask[3];
6154    if (Mask1[2] >= 0)
6155      Mask1[2] += 4;
6156    if (Mask1[3] >= 0)
6157      Mask1[3] += 4;
6158    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
6159  }
6160
6161  // Break it into (shuffle shuffle_hi, shuffle_lo).
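  // Elements 0-1 of the result are gathered by one shuffle of V1 and V2
  // (LoShuffle) and elements 2-3 by another (HiShuffle); a final shuffle then
  // picks each result element from the appropriate partial shuffle.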
6162  int LoMask[] = { -1, -1, -1, -1 };
6163  int HiMask[] = { -1, -1, -1, -1 };
6164
6165  int *MaskPtr = LoMask;
6166  unsigned MaskIdx = 0;
6167  unsigned LoIdx = 0;
6168  unsigned HiIdx = 2;
6169  for (unsigned i = 0; i != 4; ++i) {
6170    if (i == 2) {
6171      MaskPtr = HiMask;
6172      MaskIdx = 1;
6173      LoIdx = 0;
6174      HiIdx = 2;
6175    }
6176    int Idx = PermMask[i];
6177    if (Idx < 0) {
6178      Locs[i] = std::make_pair(-1, -1);
6179    } else if (Idx < 4) {
6180      Locs[i] = std::make_pair(MaskIdx, LoIdx);
6181      MaskPtr[LoIdx] = Idx;
6182      LoIdx++;
6183    } else {
6184      Locs[i] = std::make_pair(MaskIdx, HiIdx);
6185      MaskPtr[HiIdx] = Idx;
6186      HiIdx++;
6187    }
6188  }
6189
6190  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
6191  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
6192  int MaskOps[] = { -1, -1, -1, -1 };
6193  for (unsigned i = 0; i != 4; ++i)
6194    if (Locs[i].first != -1)
6195      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
6196  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
6197}
6198
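// MayFoldVectorLoad - Look through single-use bitcast, scalar_to_vector and
// (build_vector X, undef) wrappers to see whether the value is ultimately a
// load that could be folded into a shuffle.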
6199static bool MayFoldVectorLoad(SDValue V) {
6200  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6201    V = V.getOperand(0);
6202  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6203    V = V.getOperand(0);
6204  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
6205      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
6206    // BUILD_VECTOR (load), undef
6207    V = V.getOperand(0);
6208  if (MayFoldLoad(V))
6209    return true;
6210  return false;
6211}
6212
6213// FIXME: the version above should always be used. Since there's
6214// a bug where several vector shuffles can't be folded because the
6215// DAG is not updated during lowering and a node claims to have two
6216// uses while it only has one, use this version, and let isel match
6217// another instruction if the load really happens to have more than
6218// one use. Remove this version after this bug gets fixed.
6219// rdar://8434668, PR8156
6220static bool RelaxedMayFoldVectorLoad(SDValue V) {
6221  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6222    V = V.getOperand(0);
6223  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6224    V = V.getOperand(0);
6225  if (ISD::isNormalLoad(V.getNode()))
6226    return true;
6227  return false;
6228}
6229
6230static
6231SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
6232  EVT VT = Op.getValueType();
6233
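  // MOVDDUP duplicates the low 64-bit element, so the operand can be handled
  // uniformly as v2f64 and the result bitcast back to the original type.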
6234  // Canonicalize to v2f64.
6235  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
6236  return DAG.getNode(ISD::BITCAST, dl, VT,
6237                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
6238                                          V1, DAG));
6239}
6240
6241static
6242SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
6243                        bool HasSSE2) {
6244  SDValue V1 = Op.getOperand(0);
6245  SDValue V2 = Op.getOperand(1);
6246  EVT VT = Op.getValueType();
6247
6248  assert(VT != MVT::v2i64 && "unsupported shuffle type");
6249
6250  if (HasSSE2 && VT == MVT::v2f64)
6251    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
6252
6253  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
6254  return DAG.getNode(ISD::BITCAST, dl, VT,
6255                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
6256                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
6257                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
6258}
6259
6260static
6261SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
6262  SDValue V1 = Op.getOperand(0);
6263  SDValue V2 = Op.getOperand(1);
6264  EVT VT = Op.getValueType();
6265
6266  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
6267         "unsupported shuffle type");
6268
6269  if (V2.getOpcode() == ISD::UNDEF)
6270    V2 = V1;
6271
6272  // v4i32 or v4f32
6273  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
6274}
6275
6276static
6277SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
6278  SDValue V1 = Op.getOperand(0);
6279  SDValue V2 = Op.getOperand(1);
6280  EVT VT = Op.getValueType();
6281  unsigned NumElems = VT.getVectorNumElements();
6282
6283  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
6284  // operand of these instructions is only memory, so check if there's a
6285  // potential load folding here; otherwise use SHUFPS or MOVSD to match the
6286  // same masks.
6287  bool CanFoldLoad = false;
6288
6289  // Trivial case, when V2 comes from a load.
6290  if (MayFoldVectorLoad(V2))
6291    CanFoldLoad = true;
6292
6293  // When V1 is a load, it can be folded later into a store in isel, example:
6294  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
6295  //    turns into:
6296  //  (MOVLPSmr addr:$src1, VR128:$src2)
6297  // So, recognize this potential and also use MOVLPS or MOVLPD
6298  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
6299    CanFoldLoad = true;
6300
6301  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6302  if (CanFoldLoad) {
6303    if (HasSSE2 && NumElems == 2)
6304      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
6305
6306    if (NumElems == 4)
6307      // If we don't care about the second element, proceed to use movss.
6308      if (SVOp->getMaskElt(1) != -1)
6309        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
6310  }
6311
6312  // movl and movlp will both match v2i64, but v2i64 is never matched by
6313  // movl earlier because we make it strict to avoid messing with the movlp load
6314  // folding logic (see the code above the getMOVLP call). Match it here then;
6315  // this is horrible, but it will stay like this until we move all shuffle
6316  // matching to x86-specific nodes. Note that for the 1st condition all
6317  // types are matched with movsd.
6318  if (HasSSE2) {
6319    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
6320    // so as to remove this logic from here, as much as possible
6321    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
6322      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6323    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6324  }
6325
6326  assert(VT != MVT::v4i32 && "unsupported shuffle type");
6327
6328  // Invert the operand order and use SHUFPS to match it.
6329  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
6330                              getShuffleSHUFImmediate(SVOp), DAG);
6331}
6332
6333SDValue
6334X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
6335  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6336  EVT VT = Op.getValueType();
6337  DebugLoc dl = Op.getDebugLoc();
6338  SDValue V1 = Op.getOperand(0);
6339  SDValue V2 = Op.getOperand(1);
6340
6341  if (isZeroShuffle(SVOp))
6342    return getZeroVector(VT, Subtarget, DAG, dl);
6343
6344  // Handle splat operations
6345  if (SVOp->isSplat()) {
6346    unsigned NumElem = VT.getVectorNumElements();
6347    int Size = VT.getSizeInBits();
6348
6349    // Use vbroadcast whenever the splat comes from a foldable load
6350    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
6351    if (Broadcast.getNode())
6352      return Broadcast;
6353
6354    // Handle splats by matching through known shuffle masks
6355    if ((Size == 128 && NumElem <= 4) ||
6356        (Size == 256 && NumElem < 8))
6357      return SDValue();
6358
6359    // All remaining splats are promoted to target-supported vector shuffles.
6360    return PromoteSplat(SVOp, DAG);
6361  }
6362
6363  // If the shuffle can be profitably rewritten as a narrower shuffle, then
6364  // do it!
6365  if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
6366      VT == MVT::v16i16 || VT == MVT::v32i8) {
6367    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6368    if (NewOp.getNode())
6369      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
6370  } else if ((VT == MVT::v4i32 ||
6371             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
6372    // FIXME: Figure out a cleaner way to do this.
6373    // Try to make use of movq to zero out the top part.
6374    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
6375      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6376      if (NewOp.getNode()) {
6377        EVT NewVT = NewOp.getValueType();
6378        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
6379                               NewVT, true, false))
6380          return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
6381                              DAG, Subtarget, dl);
6382      }
6383    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
6384      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6385      if (NewOp.getNode()) {
6386        EVT NewVT = NewOp.getValueType();
6387        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
6388          return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
6389                              DAG, Subtarget, dl);
6390      }
6391    }
6392  }
6393  return SDValue();
6394}
6395
6396SDValue
6397X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
6398  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6399  SDValue V1 = Op.getOperand(0);
6400  SDValue V2 = Op.getOperand(1);
6401  EVT VT = Op.getValueType();
6402  DebugLoc dl = Op.getDebugLoc();
6403  unsigned NumElems = VT.getVectorNumElements();
6404  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
6405  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6406  bool V1IsSplat = false;
6407  bool V2IsSplat = false;
6408  bool HasSSE2 = Subtarget->hasSSE2();
6409  bool HasAVX    = Subtarget->hasAVX();
6410  bool HasAVX2   = Subtarget->hasAVX2();
6411  MachineFunction &MF = DAG.getMachineFunction();
6412  bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
6413
6414  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
6415
6416  if (V1IsUndef && V2IsUndef)
6417    return DAG.getUNDEF(VT);
6418
6419  assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
6420
6421  // Vector shuffle lowering takes 3 steps:
6422  //
6423  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
6424  //    narrowing and commutation of operands should be handled.
6425  // 2) Matching of shuffles with known shuffle masks to x86 target specific
6426  //    shuffle nodes.
6427  // 3) Rewriting of unmatched masks into new generic shuffle operations,
6428  //    so the shuffle can be broken into other shuffles and the legalizer can
6429  //    try the lowering again.
6430  //
6431  // The general idea is that no vector_shuffle operation should be left to
6432  // be matched during isel, all of them must be converted to a target specific
6433  // node here.
6434
6435  // Normalize the input vectors. Here splats, zeroed vectors, profitable
6436  // narrowing and commutation of operands should be handled. The actual code
6437  // doesn't include all of those, work in progress...
6438  SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
6439  if (NewOp.getNode())
6440    return NewOp;
6441
6442  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
6443
6444  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
6445  // unpckh_undef). Only use pshufd if speed is more important than size.
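  // (The unpck forms take no immediate byte, so they typically give a smaller
  // encoding than pshufd.)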
6446  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
6447    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6448  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
6449    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6450
6451  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
6452      V2IsUndef && RelaxedMayFoldVectorLoad(V1))
6453    return getMOVDDup(Op, dl, V1, DAG);
6454
6455  if (isMOVHLPS_v_undef_Mask(M, VT))
6456    return getMOVHighToLow(Op, dl, DAG);
6457
6458  // Used to match splats
6459  if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef &&
6460      (VT == MVT::v2f64 || VT == MVT::v2i64))
6461    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6462
6463  if (isPSHUFDMask(M, VT)) {
6464    // The actual implementation will match the mask in the if above, and then
6465    // during isel it can match several different instructions, not only pshufd
6466    // as its name says; sad but true. Emulate the behavior for now...
6467    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
6468      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
6469
6470    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
6471
6472    if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64))
6473      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG);
6474
6475    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
6476      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
6477
6478    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
6479                                TargetMask, DAG);
6480  }
6481
6482  // Check if this can be converted into a logical shift.
6483  bool isLeft = false;
6484  unsigned ShAmt = 0;
6485  SDValue ShVal;
6486  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
6487  if (isShift && ShVal.hasOneUse()) {
6488    // If the shifted value has multiple uses, it may be cheaper to use
6489    // v_set0 + movlhps or movhlps, etc.
6490    EVT EltVT = VT.getVectorElementType();
6491    ShAmt *= EltVT.getSizeInBits();
6492    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6493  }
6494
6495  if (isMOVLMask(M, VT)) {
6496    if (ISD::isBuildVectorAllZeros(V1.getNode()))
6497      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
6498    if (!isMOVLPMask(M, VT)) {
6499      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
6500        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6501
6502      if (VT == MVT::v4i32 || VT == MVT::v4f32)
6503        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6504    }
6505  }
6506
6507  // FIXME: fold these into legal mask.
6508  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2))
6509    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
6510
6511  if (isMOVHLPSMask(M, VT))
6512    return getMOVHighToLow(Op, dl, DAG);
6513
6514  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
6515    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
6516
6517  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
6518    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
6519
6520  if (isMOVLPMask(M, VT))
6521    return getMOVLP(Op, dl, DAG, HasSSE2);
6522
6523  if (ShouldXformToMOVHLPS(M, VT) ||
6524      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
6525    return CommuteVectorShuffle(SVOp, DAG);
6526
6527  if (isShift) {
6528    // No better options. Use a vshldq / vsrldq.
6529    EVT EltVT = VT.getVectorElementType();
6530    ShAmt *= EltVT.getSizeInBits();
6531    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6532  }
6533
6534  bool Commuted = false;
6535  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
6536  // 1,1,1,1 -> v8i16 though.
6537  V1IsSplat = isSplatVector(V1.getNode());
6538  V2IsSplat = isSplatVector(V2.getNode());
6539
6540  // Canonicalize the splat or undef, if present, to be on the RHS.
6541  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
6542    CommuteVectorShuffleMask(M, NumElems);
6543    std::swap(V1, V2);
6544    std::swap(V1IsSplat, V2IsSplat);
6545    Commuted = true;
6546  }
6547
6548  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
6549    // Shuffling low element of v1 into undef, just return v1.
6550    if (V2IsUndef)
6551      return V1;
6552    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
6553    // the instruction selector will not match, so get a canonical MOVL with
6554    // swapped operands to undo the commute.
6555    return getMOVL(DAG, dl, VT, V2, V1);
6556  }
6557
6558  if (isUNPCKLMask(M, VT, HasAVX2))
6559    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6560
6561  if (isUNPCKHMask(M, VT, HasAVX2))
6562    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6563
6564  if (V2IsSplat) {
6565    // Normalize the mask so all entries that point to V2 point to its first
6566    // element, then try to match unpck{h|l} again. If it matches, return a
6567    // new vector_shuffle with the corrected mask.
6568    SmallVector<int, 8> NewMask(M.begin(), M.end());
6569    NormalizeMask(NewMask, NumElems);
6570    if (isUNPCKLMask(NewMask, VT, HasAVX2, true))
6571      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6572    if (isUNPCKHMask(NewMask, VT, HasAVX2, true))
6573      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6574  }
6575
6576  if (Commuted) {
6577    // Commute it back and try unpck* again.
6578    // FIXME: this seems wrong.
6579    CommuteVectorShuffleMask(M, NumElems);
6580    std::swap(V1, V2);
6581    std::swap(V1IsSplat, V2IsSplat);
6582    Commuted = false;
6583
6584    if (isUNPCKLMask(M, VT, HasAVX2))
6585      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6586
6587    if (isUNPCKHMask(M, VT, HasAVX2))
6588      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6589  }
6590
6591  // Normalize the node to match x86 shuffle ops if needed
6592  if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true)))
6593    return CommuteVectorShuffle(SVOp, DAG);
6594
6595  // The checks below are all present in isShuffleMaskLegal, but they are
6596  // inlined here right now to enable us to directly emit target specific
6597  // nodes, and remove one by one until they don't return Op anymore.
6598
6599  if (isPALIGNRMask(M, VT, Subtarget))
6600    return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
6601                                getShufflePALIGNRImmediate(SVOp),
6602                                DAG);
6603
6604  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
6605      SVOp->getSplatIndex() == 0 && V2IsUndef) {
6606    if (VT == MVT::v2f64 || VT == MVT::v2i64)
6607      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6608  }
6609
6610  if (isPSHUFHWMask(M, VT, HasAVX2))
6611    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
6612                                getShufflePSHUFHWImmediate(SVOp),
6613                                DAG);
6614
6615  if (isPSHUFLWMask(M, VT, HasAVX2))
6616    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
6617                                getShufflePSHUFLWImmediate(SVOp),
6618                                DAG);
6619
6620  if (isSHUFPMask(M, VT, HasAVX))
6621    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
6622                                getShuffleSHUFImmediate(SVOp), DAG);
6623
6624  if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
6625    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6626  if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
6627    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6628
6629  //===--------------------------------------------------------------------===//
6630  // Generate target specific nodes for 128 or 256-bit shuffles only
6631  // supported in the AVX instruction set.
6632  //
6633
6634  // Handle VMOVDDUPY permutations
6635  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX))
6636    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
6637
6638  // Handle VPERMILPS/D* permutations
6639  if (isVPERMILPMask(M, VT, HasAVX)) {
6640    if (HasAVX2 && VT == MVT::v8i32)
6641      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
6642                                  getShuffleSHUFImmediate(SVOp), DAG);
6643    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
6644                                getShuffleSHUFImmediate(SVOp), DAG);
6645  }
6646
6647  // Handle VPERM2F128/VPERM2I128 permutations
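  // Illustrative note (not from the original comment): the immediate selects a
  // 128-bit half of either source for each half of the result; for instance an
  // immediate of 0x20 is expected to yield { V1[127:0], V2[127:0] }.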
6648  if (isVPERM2X128Mask(M, VT, HasAVX))
6649    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
6650                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
6651
6652  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
6653  if (BlendOp.getNode())
6654    return BlendOp;
6655
6656  if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
6657    SmallVector<SDValue, 8> permclMask;
6658    for (unsigned i = 0; i != 8; ++i) {
6659      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
6660    }
6661    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
6662                               &permclMask[0], 8);
6663    // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
6664    return DAG.getNode(X86ISD::VPERMV, dl, VT,
6665                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
6666  }
6667
6668  if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64))
6669    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
6670                                getShuffleCLImmediate(SVOp), DAG);
6671
6673  //===--------------------------------------------------------------------===//
6674  // Since no target specific shuffle was selected for this generic one,
6675  // lower it into other known shuffles. FIXME: this isn't true yet, but
6676  // this is the plan.
6677  //
6678
6679  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
6680  if (VT == MVT::v8i16) {
6681    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
6682    if (NewOp.getNode())
6683      return NewOp;
6684  }
6685
6686  if (VT == MVT::v16i8) {
6687    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
6688    if (NewOp.getNode())
6689      return NewOp;
6690  }
6691
6692  // Handle all 128-bit wide vectors with 4 elements, and match them with
6693  // several different shuffle types.
6694  if (NumElems == 4 && VT.getSizeInBits() == 128)
6695    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
6696
6697  // Handle general 256-bit shuffles
6698  if (VT.is256BitVector())
6699    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
6700
6701  return SDValue();
6702}
6703
6704SDValue
6705X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
6706                                                SelectionDAG &DAG) const {
6707  EVT VT = Op.getValueType();
6708  DebugLoc dl = Op.getDebugLoc();
6709
6710  if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
6711    return SDValue();
6712
6713  if (VT.getSizeInBits() == 8) {
6714    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
6715                                    Op.getOperand(0), Op.getOperand(1));
6716    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
6717                                    DAG.getValueType(VT));
6718    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
6719  }
6720
6721  if (VT.getSizeInBits() == 16) {
6722    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6723    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
6724    if (Idx == 0)
6725      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
6726                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
6727                                     DAG.getNode(ISD::BITCAST, dl,
6728                                                 MVT::v4i32,
6729                                                 Op.getOperand(0)),
6730                                     Op.getOperand(1)));
6731    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
6732                                    Op.getOperand(0), Op.getOperand(1));
6733    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
6734                                    DAG.getValueType(VT));
6735    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
6736  }
6737
6738  if (VT == MVT::f32) {
6739    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to an FR32 register. It's only worth matching if the
6741    // result has a single use which is a store or a bitcast to i32.  And in
6742    // the case of a store, it's not worth it if the index is a constant 0,
6743    // because a MOVSSmr can be used instead, which is smaller and faster.
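    // Illustrative sketch (not from the original source): IR along the lines of
    //   %e = extractelement <4 x float> %v, i32 1
    //   %b = bitcast float %e to i32
    // is the kind of pattern this transform targets, since the extracted value
    // can then stay in a GPR via EXTRACTPS without a movd back to an XMM
    // register.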
6744    if (!Op.hasOneUse())
6745      return SDValue();
6746    SDNode *User = *Op.getNode()->use_begin();
6747    if ((User->getOpcode() != ISD::STORE ||
6748         (isa<ConstantSDNode>(Op.getOperand(1)) &&
6749          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
6750        (User->getOpcode() != ISD::BITCAST ||
6751         User->getValueType(0) != MVT::i32))
6752      return SDValue();
6753    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
6754                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
6755                                              Op.getOperand(0)),
6756                                              Op.getOperand(1));
6757    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
6758  }
6759
6760  if (VT == MVT::i32 || VT == MVT::i64) {
    // ExtractPS/pextrq work with a constant index.
6762    if (isa<ConstantSDNode>(Op.getOperand(1)))
6763      return Op;
6764  }
6765  return SDValue();
6766}
6767
6768
6769SDValue
6770X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
6771                                           SelectionDAG &DAG) const {
6772  if (!isa<ConstantSDNode>(Op.getOperand(1)))
6773    return SDValue();
6774
6775  SDValue Vec = Op.getOperand(0);
6776  EVT VecVT = Vec.getValueType();
6777
6778  // If this is a 256-bit vector result, first extract the 128-bit vector and
6779  // then extract the element from the 128-bit vector.
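  // For example (illustration only), extracting element 5 of a v8i32 becomes an
  // extract of the upper 128-bit half followed by an EXTRACT_VECTOR_ELT with
  // index 5 - 4 = 1 on the resulting v4i32.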
6780  if (VecVT.getSizeInBits() == 256) {
6781    DebugLoc dl = Op.getNode()->getDebugLoc();
6782    unsigned NumElems = VecVT.getVectorNumElements();
6783    SDValue Idx = Op.getOperand(1);
6784    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
6785
6786    // Get the 128-bit vector.
6787    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
6788
6789    if (IdxVal >= NumElems/2)
6790      IdxVal -= NumElems/2;
6791    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
6792                       DAG.getConstant(IdxVal, MVT::i32));
6793  }
6794
6795  assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
6796
6797  if (Subtarget->hasSSE41()) {
6798    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
6799    if (Res.getNode())
6800      return Res;
6801  }
6802
6803  EVT VT = Op.getValueType();
6804  DebugLoc dl = Op.getDebugLoc();
6805  // TODO: handle v16i8.
6806  if (VT.getSizeInBits() == 16) {
6807    SDValue Vec = Op.getOperand(0);
6808    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6809    if (Idx == 0)
6810      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
6811                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
6812                                     DAG.getNode(ISD::BITCAST, dl,
6813                                                 MVT::v4i32, Vec),
6814                                     Op.getOperand(1)));
    // Transform it so it matches pextrw, which produces a 32-bit result.
6816    EVT EltVT = MVT::i32;
6817    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
6818                                    Op.getOperand(0), Op.getOperand(1));
6819    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
6820                                    DAG.getValueType(VT));
6821    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
6822  }
6823
6824  if (VT.getSizeInBits() == 32) {
6825    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6826    if (Idx == 0)
6827      return Op;
6828
6829    // SHUFPS the element to the lowest double word, then movss.
6830    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
6831    EVT VVT = Op.getOperand(0).getValueType();
6832    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
6833                                       DAG.getUNDEF(VVT), Mask);
6834    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
6835                       DAG.getIntPtrConstant(0));
6836  }
6837
6838  if (VT.getSizeInBits() == 64) {
6839    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
6840    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
6841    //        to match extract_elt for f64.
6842    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6843    if (Idx == 0)
6844      return Op;
6845
6846    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note that if the lower 64 bits of the UNPCKHPD result are then stored
    // to an f64mem, the whole operation is folded into a single MOVHPDmr.
6849    int Mask[2] = { 1, -1 };
6850    EVT VVT = Op.getOperand(0).getValueType();
6851    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
6852                                       DAG.getUNDEF(VVT), Mask);
6853    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
6854                       DAG.getIntPtrConstant(0));
6855  }
6856
6857  return SDValue();
6858}
6859
6860SDValue
6861X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
6862                                               SelectionDAG &DAG) const {
6863  EVT VT = Op.getValueType();
6864  EVT EltVT = VT.getVectorElementType();
6865  DebugLoc dl = Op.getDebugLoc();
6866
6867  SDValue N0 = Op.getOperand(0);
6868  SDValue N1 = Op.getOperand(1);
6869  SDValue N2 = Op.getOperand(2);
6870
6871  if (VT.getSizeInBits() == 256)
6872    return SDValue();
6873
6874  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
6875      isa<ConstantSDNode>(N2)) {
6876    unsigned Opc;
6877    if (VT == MVT::v8i16)
6878      Opc = X86ISD::PINSRW;
    else
      Opc = X86ISD::PINSRB;
6883
    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
6885    // argument.
6886    if (N1.getValueType() != MVT::i32)
6887      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
6888    if (N2.getValueType() != MVT::i32)
6889      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
6890    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
6891  }
6892
6893  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
6894    // Bits [7:6] of the constant are the source select.  This will always be
6895    //  zero here.  The DAG Combiner may combine an extract_elt index into these
6896    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
6897    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
6898    // Bits [5:4] of the constant are the destination select.  This is the
6899    //  value of the incoming immediate.
6900    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
6901    //   combine either bitwise AND or insert of float 0.0 to set these bits.
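    // Worked example (illustrative): for (insert (extract W, 3), 2) with no
    // zeroed lanes, the combined immediate would be (3 << 6) | (2 << 4) = 0xE0,
    // assuming the DAG combiner folds the extract index into bits [7:6].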
6902    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    // Create this as a scalar-to-vector node.
6904    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
6905    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
6906  }
6907
6908  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
    // PINSR* works with a constant index.
6910    return Op;
6911  }
6912  return SDValue();
6913}
6914
6915SDValue
6916X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
6917  EVT VT = Op.getValueType();
6918  EVT EltVT = VT.getVectorElementType();
6919
6920  DebugLoc dl = Op.getDebugLoc();
6921  SDValue N0 = Op.getOperand(0);
6922  SDValue N1 = Op.getOperand(1);
6923  SDValue N2 = Op.getOperand(2);
6924
6925  // If this is a 256-bit vector result, first extract the 128-bit vector,
6926  // insert the element into the extracted half and then place it back.
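  // For instance (illustration only), inserting into lane 6 of a v8i32 extracts
  // the upper 128-bit half, inserts at index 6 - 4 = 2 within that v4i32, and
  // then writes the half back with Insert128BitVector.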
6927  if (VT.getSizeInBits() == 256) {
6928    if (!isa<ConstantSDNode>(N2))
6929      return SDValue();
6930
6931    // Get the desired 128-bit vector half.
6932    unsigned NumElems = VT.getVectorNumElements();
6933    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
6934    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
6935
6936    // Insert the element into the desired half.
6937    bool Upper = IdxVal >= NumElems/2;
6938    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
6939                 DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
6940
6941    // Insert the changed part back to the 256-bit vector
6942    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
6943  }
6944
6945  if (Subtarget->hasSSE41())
6946    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
6947
6948  if (EltVT == MVT::i8)
6949    return SDValue();
6950
6951  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
6953    // as its second argument.
6954    if (N1.getValueType() != MVT::i32)
6955      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
6956    if (N2.getValueType() != MVT::i32)
6957      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
6958    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
6959  }
6960  return SDValue();
6961}
6962
6963SDValue
6964X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6965  LLVMContext *Context = DAG.getContext();
6966  DebugLoc dl = Op.getDebugLoc();
6967  EVT OpVT = Op.getValueType();
6968
6969  // If this is a 256-bit vector result, first insert into a 128-bit
6970  // vector and then insert into the 256-bit vector.
6971  if (OpVT.getSizeInBits() > 128) {
6972    // Insert into a 128-bit vector.
6973    EVT VT128 = EVT::getVectorVT(*Context,
6974                                 OpVT.getVectorElementType(),
6975                                 OpVT.getVectorNumElements() / 2);
6976
6977    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
6978
6979    // Insert the 128-bit vector.
6980    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
6981  }
6982
6983  if (OpVT == MVT::v1i64 &&
6984      Op.getOperand(0).getValueType() == MVT::i64)
6985    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
6986
6987  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
6988  assert(OpVT.getSizeInBits() == 128 && "Expected an SSE type!");
6989  return DAG.getNode(ISD::BITCAST, dl, OpVT,
6990                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
6991}
6992
6993// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
6994// a simple subregister reference or explicit instructions to grab
6995// upper bits of a vector.
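// As an illustration (not part of the original comment): pulling elements 4-7
// of a v8f32 out as a v4f32 is expected to map onto VEXTRACTF128 with an
// immediate of 1, while the low half can be a plain subregister copy.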
6996SDValue
6997X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
6998  if (Subtarget->hasAVX()) {
6999    DebugLoc dl = Op.getNode()->getDebugLoc();
7000    SDValue Vec = Op.getNode()->getOperand(0);
7001    SDValue Idx = Op.getNode()->getOperand(1);
7002
7003    if (Op.getNode()->getValueType(0).getSizeInBits() == 128 &&
7004        Vec.getNode()->getValueType(0).getSizeInBits() == 256 &&
7005        isa<ConstantSDNode>(Idx)) {
7006      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7007      return Extract128BitVector(Vec, IdxVal, DAG, dl);
7008    }
7009  }
7010  return SDValue();
7011}
7012
7013// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
7014// simple superregister reference or explicit instructions to insert
7015// the upper bits of a vector.
7016SDValue
7017X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
7018  if (Subtarget->hasAVX()) {
7019    DebugLoc dl = Op.getNode()->getDebugLoc();
7020    SDValue Vec = Op.getNode()->getOperand(0);
7021    SDValue SubVec = Op.getNode()->getOperand(1);
7022    SDValue Idx = Op.getNode()->getOperand(2);
7023
7024    if (Op.getNode()->getValueType(0).getSizeInBits() == 256 &&
7025        SubVec.getNode()->getValueType(0).getSizeInBits() == 128 &&
7026        isa<ConstantSDNode>(Idx)) {
7027      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7028      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
7029    }
7030  }
7031  return SDValue();
7032}
7033
7034// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
7036// one of the above mentioned nodes. It has to be wrapped because otherwise
7037// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
7039// into MOV32ri.
7040SDValue
7041X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
7042  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
7043
7044  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7045  // global base reg.
7046  unsigned char OpFlag = 0;
7047  unsigned WrapperKind = X86ISD::Wrapper;
7048  CodeModel::Model M = getTargetMachine().getCodeModel();
7049
7050  if (Subtarget->isPICStyleRIPRel() &&
7051      (M == CodeModel::Small || M == CodeModel::Kernel))
7052    WrapperKind = X86ISD::WrapperRIP;
7053  else if (Subtarget->isPICStyleGOT())
7054    OpFlag = X86II::MO_GOTOFF;
7055  else if (Subtarget->isPICStyleStubPIC())
7056    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7057
7058  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
7059                                             CP->getAlignment(),
7060                                             CP->getOffset(), OpFlag);
7061  DebugLoc DL = CP->getDebugLoc();
7062  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7063  // With PIC, the address is actually $g + Offset.
7064  if (OpFlag) {
7065    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7066                         DAG.getNode(X86ISD::GlobalBaseReg,
7067                                     DebugLoc(), getPointerTy()),
7068                         Result);
7069  }
7070
7071  return Result;
7072}
7073
7074SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
7075  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
7076
7077  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7078  // global base reg.
7079  unsigned char OpFlag = 0;
7080  unsigned WrapperKind = X86ISD::Wrapper;
7081  CodeModel::Model M = getTargetMachine().getCodeModel();
7082
7083  if (Subtarget->isPICStyleRIPRel() &&
7084      (M == CodeModel::Small || M == CodeModel::Kernel))
7085    WrapperKind = X86ISD::WrapperRIP;
7086  else if (Subtarget->isPICStyleGOT())
7087    OpFlag = X86II::MO_GOTOFF;
7088  else if (Subtarget->isPICStyleStubPIC())
7089    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7090
7091  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
7092                                          OpFlag);
7093  DebugLoc DL = JT->getDebugLoc();
7094  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7095
7096  // With PIC, the address is actually $g + Offset.
7097  if (OpFlag)
7098    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7099                         DAG.getNode(X86ISD::GlobalBaseReg,
7100                                     DebugLoc(), getPointerTy()),
7101                         Result);
7102
7103  return Result;
7104}
7105
7106SDValue
7107X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
7108  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
7109
7110  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7111  // global base reg.
7112  unsigned char OpFlag = 0;
7113  unsigned WrapperKind = X86ISD::Wrapper;
7114  CodeModel::Model M = getTargetMachine().getCodeModel();
7115
7116  if (Subtarget->isPICStyleRIPRel() &&
7117      (M == CodeModel::Small || M == CodeModel::Kernel)) {
7118    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
7119      OpFlag = X86II::MO_GOTPCREL;
7120    WrapperKind = X86ISD::WrapperRIP;
7121  } else if (Subtarget->isPICStyleGOT()) {
7122    OpFlag = X86II::MO_GOT;
7123  } else if (Subtarget->isPICStyleStubPIC()) {
7124    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
7125  } else if (Subtarget->isPICStyleStubNoDynamic()) {
7126    OpFlag = X86II::MO_DARWIN_NONLAZY;
7127  }
7128
7129  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
7130
7131  DebugLoc DL = Op.getDebugLoc();
7132  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7133
7134
7136  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
7137      !Subtarget->is64Bit()) {
7138    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7139                         DAG.getNode(X86ISD::GlobalBaseReg,
7140                                     DebugLoc(), getPointerTy()),
7141                         Result);
7142  }
7143
7144  // For symbols that require a load from a stub to get the address, emit the
7145  // load.
7146  if (isGlobalStubReference(OpFlag))
7147    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
7148                         MachinePointerInfo::getGOT(), false, false, false, 0);
7149
7150  return Result;
7151}
7152
7153SDValue
7154X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddress node.
7156  unsigned char OpFlags =
7157    Subtarget->ClassifyBlockAddressReference();
7158  CodeModel::Model M = getTargetMachine().getCodeModel();
7159  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
7160  DebugLoc dl = Op.getDebugLoc();
7161  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
7162                                       /*isTarget=*/true, OpFlags);
7163
7164  if (Subtarget->isPICStyleRIPRel() &&
7165      (M == CodeModel::Small || M == CodeModel::Kernel))
7166    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7167  else
7168    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7169
7170  // With PIC, the address is actually $g + Offset.
7171  if (isGlobalRelativeToPICBase(OpFlags)) {
7172    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7173                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7174                         Result);
7175  }
7176
7177  return Result;
7178}
7179
7180SDValue
7181X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
7182                                      int64_t Offset,
7183                                      SelectionDAG &DAG) const {
7184  // Create the TargetGlobalAddress node, folding in the constant
7185  // offset if it is legal.
7186  unsigned char OpFlags =
7187    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
7188  CodeModel::Model M = getTargetMachine().getCodeModel();
7189  SDValue Result;
7190  if (OpFlags == X86II::MO_NO_FLAG &&
7191      X86::isOffsetSuitableForCodeModel(Offset, M)) {
7192    // A direct static reference to a global.
7193    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
7194    Offset = 0;
7195  } else {
7196    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
7197  }
7198
7199  if (Subtarget->isPICStyleRIPRel() &&
7200      (M == CodeModel::Small || M == CodeModel::Kernel))
7201    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7202  else
7203    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7204
7205  // With PIC, the address is actually $g + Offset.
7206  if (isGlobalRelativeToPICBase(OpFlags)) {
7207    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7208                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7209                         Result);
7210  }
7211
7212  // For globals that require a load from a stub to get the address, emit the
7213  // load.
7214  if (isGlobalStubReference(OpFlags))
7215    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
7216                         MachinePointerInfo::getGOT(), false, false, false, 0);
7217
7218  // If there was a non-zero offset that we didn't fold, create an explicit
7219  // addition for it.
7220  if (Offset != 0)
7221    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
7222                         DAG.getConstant(Offset, getPointerTy()));
7223
7224  return Result;
7225}
7226
7227SDValue
7228X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
7229  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
7230  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
7231  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
7232}
7233
7234static SDValue
7235GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
7236           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
7237           unsigned char OperandFlags) {
7238  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7239  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7240  DebugLoc dl = GA->getDebugLoc();
7241  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7242                                           GA->getValueType(0),
7243                                           GA->getOffset(),
7244                                           OperandFlags);
7245  if (InFlag) {
7246    SDValue Ops[] = { Chain,  TGA, *InFlag };
7247    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
7248  } else {
7249    SDValue Ops[]  = { Chain, TGA };
7250    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
7251  }
7252
  // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
7254  MFI->setAdjustsStack(true);
7255
7256  SDValue Flag = Chain.getValue(1);
7257  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
7258}
7259
7260// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
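// Informal sketch of what this ultimately produces (32-bit ELF general dynamic,
// not taken from the original source):
//   leal  x@TLSGD(,%ebx,1), %eax
//   call  ___tls_get_addr@PLT
// with the variable's address returned in %eax.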
7261static SDValue
7262LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7263                                const EVT PtrVT) {
7264  SDValue InFlag;
  DebugLoc dl = GA->getDebugLoc();  // TODO: function entry point might be better.
7266  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
7267                                     DAG.getNode(X86ISD::GlobalBaseReg,
7268                                                 DebugLoc(), PtrVT), InFlag);
7269  InFlag = Chain.getValue(1);
7270
7271  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
7272}
7273
7274// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
7275static SDValue
7276LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7277                                const EVT PtrVT) {
7278  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
7279                    X86::RAX, X86II::MO_TLSGD);
7280}
7281
7282// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
7283// "local exec" model.
7284static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7285                                   const EVT PtrVT, TLSModel::Model model,
7286                                   bool is64Bit) {
7287  DebugLoc dl = GA->getDebugLoc();
7288
7289  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
7290  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
7291                                                         is64Bit ? 257 : 256));
7292
7293  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
7294                                      DAG.getIntPtrConstant(0),
7295                                      MachinePointerInfo(Ptr),
7296                                      false, false, false, 0);
7297
7298  unsigned char OperandFlags = 0;
7299  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
  // initial exec.
7301  unsigned WrapperKind = X86ISD::Wrapper;
7302  if (model == TLSModel::LocalExec) {
7303    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
7304  } else if (is64Bit) {
7305    assert(model == TLSModel::InitialExec);
7306    OperandFlags = X86II::MO_GOTTPOFF;
7307    WrapperKind = X86ISD::WrapperRIP;
7308  } else {
7309    assert(model == TLSModel::InitialExec);
7310    OperandFlags = X86II::MO_INDNTPOFF;
7311  }
7312
7313  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
7314  // exec)
7315  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7316                                           GA->getValueType(0),
7317                                           GA->getOffset(), OperandFlags);
7318  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
7319
7320  if (model == TLSModel::InitialExec)
7321    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
7322                         MachinePointerInfo::getGOT(), false, false, false, 0);
7323
7324  // The address of the thread local variable is the add of the thread
7325  // pointer with the offset of the variable.
7326  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
7327}
7328
7329SDValue
7330X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
7331
7332  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
7333  const GlobalValue *GV = GA->getGlobal();
7334
7335  if (Subtarget->isTargetELF()) {
7336    // TODO: implement the "local dynamic" model
    // TODO: implement the "initial exec" model for PIC executables
7338
7339    // If GV is an alias then use the aliasee for determining
7340    // thread-localness.
7341    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
7342      GV = GA->resolveAliasedGlobal(false);
7343
7344    TLSModel::Model model = getTargetMachine().getTLSModel(GV);
7345
7346    switch (model) {
7347      case TLSModel::GeneralDynamic:
7348      case TLSModel::LocalDynamic: // not implemented
7349        if (Subtarget->is64Bit())
7350          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
7351        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
7352
7353      case TLSModel::InitialExec:
7354      case TLSModel::LocalExec:
7355        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
7356                                   Subtarget->is64Bit());
7357    }
7358    llvm_unreachable("Unknown TLS model.");
7359  }
7360
7361  if (Subtarget->isTargetDarwin()) {
7362    // Darwin only has one model of TLS.  Lower to that.
7363    unsigned char OpFlag = 0;
7364    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
7365                           X86ISD::WrapperRIP : X86ISD::Wrapper;
7366
7367    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7368    // global base reg.
7369    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
7370                  !Subtarget->is64Bit();
7371    if (PIC32)
7372      OpFlag = X86II::MO_TLVP_PIC_BASE;
7373    else
7374      OpFlag = X86II::MO_TLVP;
7375    DebugLoc DL = Op.getDebugLoc();
7376    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
7377                                                GA->getValueType(0),
7378                                                GA->getOffset(), OpFlag);
7379    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7380
7381    // With PIC32, the address is actually $g + Offset.
7382    if (PIC32)
7383      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7384                           DAG.getNode(X86ISD::GlobalBaseReg,
7385                                       DebugLoc(), getPointerTy()),
7386                           Offset);
7387
    // Lowering the machine ISD node will make sure everything ends up in the
    // right location.
7390    SDValue Chain = DAG.getEntryNode();
7391    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7392    SDValue Args[] = { Chain, Offset };
7393    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
7394
    // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
7396    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7397    MFI->setAdjustsStack(true);
7398
7399    // And our return value (tls address) is in the standard call return value
7400    // location.
7401    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
7402    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
7403                              Chain.getValue(1));
7404  }
7405
7406  if (Subtarget->isTargetWindows()) {
7407    // Just use the implicit TLS architecture
    // Need to generate something similar to:
7409    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
7410    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index] ; Load index (from C runtime)
7412    //   mov     rcx, qword [rdx+rcx*8]
7413    //   mov     eax, .tls$:tlsvar
7414    //   [rax+rcx] contains the address
7415    // Windows 64bit: gs:0x58
7416    // Windows 32bit: fs:__tls_array
7417
7418    // If GV is an alias then use the aliasee for determining
7419    // thread-localness.
7420    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
7421      GV = GA->resolveAliasedGlobal(false);
7422    DebugLoc dl = GA->getDebugLoc();
7423    SDValue Chain = DAG.getEntryNode();
7424
7425    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
7426    // %gs:0x58 (64-bit).
7427    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
7428                                        ? Type::getInt8PtrTy(*DAG.getContext(),
7429                                                             256)
7430                                        : Type::getInt32PtrTy(*DAG.getContext(),
7431                                                              257));
7432
7433    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain,
7434                                        Subtarget->is64Bit()
7435                                        ? DAG.getIntPtrConstant(0x58)
7436                                        : DAG.getExternalSymbol("_tls_array",
7437                                                                getPointerTy()),
7438                                        MachinePointerInfo(Ptr),
7439                                        false, false, false, 0);
7440
7441    // Load the _tls_index variable
7442    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
7443    if (Subtarget->is64Bit())
7444      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
7445                           IDX, MachinePointerInfo(), MVT::i32,
7446                           false, false, 0);
7447    else
7448      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
7449                        false, false, false, 0);
7450
7451    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
7452                                    getPointerTy());
7453    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
7454
7455    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
7456    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
7457                      false, false, false, 0);
7458
7459    // Get the offset of start of .tls section
7460    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7461                                             GA->getValueType(0),
7462                                             GA->getOffset(), X86II::MO_SECREL);
7463    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
7464
7465    // The address of the thread local variable is the add of the thread
7466    // pointer with the offset of the variable.
7467    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
7468  }
7469
7470  llvm_unreachable("TLS not implemented for this target.");
7471}
7472
7473
7474/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
7475/// and take a 2 x i32 value to shift plus a shift amount.
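// Roughly, for SHL_PARTS with 32-bit halves this computes (informal sketch):
//   if ((Amt & 32) == 0) { Hi = SHLD(Hi, Lo, Amt); Lo = Lo << Amt; }
//   else                 { Hi = Lo << (Amt & 31);  Lo = 0;         }
// and the SRA/SRL variants mirror this with SHRD and a sign/zero fill.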
7476SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
7477  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
7478  EVT VT = Op.getValueType();
7479  unsigned VTBits = VT.getSizeInBits();
7480  DebugLoc dl = Op.getDebugLoc();
7481  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
7482  SDValue ShOpLo = Op.getOperand(0);
7483  SDValue ShOpHi = Op.getOperand(1);
7484  SDValue ShAmt  = Op.getOperand(2);
7485  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
7486                                     DAG.getConstant(VTBits - 1, MVT::i8))
7487                       : DAG.getConstant(0, VT);
7488
7489  SDValue Tmp2, Tmp3;
7490  if (Op.getOpcode() == ISD::SHL_PARTS) {
7491    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
7492    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
7493  } else {
7494    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
7495    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
7496  }
7497
7498  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
7499                                DAG.getConstant(VTBits, MVT::i8));
7500  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
7501                             AndNode, DAG.getConstant(0, MVT::i8));
7502
7503  SDValue Hi, Lo;
7504  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
7505  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
7506  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
7507
7508  if (Op.getOpcode() == ISD::SHL_PARTS) {
7509    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7510    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7511  } else {
7512    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7513    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7514  }
7515
7516  SDValue Ops[2] = { Lo, Hi };
7517  return DAG.getMergeValues(Ops, 2, dl);
7518}
7519
7520SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
7521                                           SelectionDAG &DAG) const {
7522  EVT SrcVT = Op.getOperand(0).getValueType();
7523
7524  if (SrcVT.isVector())
7525    return SDValue();
7526
7527  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
7528         "Unknown SINT_TO_FP to lower!");
7529
7530  // These are really Legal; return the operand so the caller accepts it as
7531  // Legal.
7532  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
7533    return Op;
7534  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
7535      Subtarget->is64Bit()) {
7536    return Op;
7537  }
7538
7539  DebugLoc dl = Op.getDebugLoc();
7540  unsigned Size = SrcVT.getSizeInBits()/8;
7541  MachineFunction &MF = DAG.getMachineFunction();
7542  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
7543  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7544  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7545                               StackSlot,
7546                               MachinePointerInfo::getFixedStack(SSFI),
7547                               false, false, 0);
7548  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
7549}
7550
7551SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
7552                                     SDValue StackSlot,
7553                                     SelectionDAG &DAG) const {
7554  // Build the FILD
7555  DebugLoc DL = Op.getDebugLoc();
7556  SDVTList Tys;
7557  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
7558  if (useSSE)
7559    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
7560  else
7561    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
7562
7563  unsigned ByteSize = SrcVT.getSizeInBits()/8;
7564
7565  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
7566  MachineMemOperand *MMO;
7567  if (FI) {
7568    int SSFI = FI->getIndex();
7569    MMO =
7570      DAG.getMachineFunction()
7571      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7572                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
7573  } else {
7574    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
7575    StackSlot = StackSlot.getOperand(1);
7576  }
7577  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
7578  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
7579                                           X86ISD::FILD, DL,
7580                                           Tys, Ops, array_lengthof(Ops),
7581                                           SrcVT, MMO);
7582
7583  if (useSSE) {
7584    Chain = Result.getValue(1);
7585    SDValue InFlag = Result.getValue(2);
7586
7587    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
7588    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When the stackifier is fixed, they can be uncoupled.
7590    MachineFunction &MF = DAG.getMachineFunction();
7591    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
7592    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
7593    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7594    Tys = DAG.getVTList(MVT::Other);
7595    SDValue Ops[] = {
7596      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
7597    };
7598    MachineMemOperand *MMO =
7599      DAG.getMachineFunction()
7600      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7601                            MachineMemOperand::MOStore, SSFISize, SSFISize);
7602
7603    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
7604                                    Ops, array_lengthof(Ops),
7605                                    Op.getValueType(), MMO);
7606    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
7607                         MachinePointerInfo::getFixedStack(SSFI),
7608                         false, false, false, 0);
7609  }
7610
7611  return Result;
7612}
7613
7614// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
7615SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
7616                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
7618  /*
7619     movq       %rax,  %xmm0
7620     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
7621     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
7622     #ifdef __SSE3__
7623       haddpd   %xmm0, %xmm0
7624     #else
7625       pshufd   $0x4e, %xmm0, %xmm1
7626       addpd    %xmm1, %xmm0
7627     #endif
7628  */
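  // Why this works (explanatory note): punpckldq pairs the low and high 32-bit
  // halves of the input with the exponent patterns 0x43300000 and 0x45300000,
  // forming the doubles 2^52 + lo and 2^84 + hi * 2^32. Subtracting c1 strips
  // the 2^52 / 2^84 biases, and the final horizontal add reassembles
  // lo + hi * 2^32, i.e. the original unsigned 64-bit value as a double.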
7629
7630  DebugLoc dl = Op.getDebugLoc();
7631  LLVMContext *Context = DAG.getContext();
7632
7633  // Build some magic constants.
7634  const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
7635  Constant *C0 = ConstantDataVector::get(*Context, CV0);
7636  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
7637
7638  SmallVector<Constant*,2> CV1;
7639  CV1.push_back(
7640        ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
7641  CV1.push_back(
7642        ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
7643  Constant *C1 = ConstantVector::get(CV1);
7644  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
7645
7646  // Load the 64-bit value into an XMM register.
7647  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
7648                            Op.getOperand(0));
7649  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
7650                              MachinePointerInfo::getConstantPool(),
7651                              false, false, false, 16);
7652  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
7653                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
7654                              CLod0);
7655
7656  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
7657                              MachinePointerInfo::getConstantPool(),
7658                              false, false, false, 16);
7659  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
7660  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
7661  SDValue Result;
7662
7663  if (Subtarget->hasSSE3()) {
7664    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
7665    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
7666  } else {
7667    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
7668    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
7669                                           S2F, 0x4E, DAG);
7670    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
7671                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
7672                         Sub);
7673  }
7674
7675  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
7676                     DAG.getIntPtrConstant(0));
7677}
7678
7679// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
7680SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
7681                                               SelectionDAG &DAG) const {
7682  DebugLoc dl = Op.getDebugLoc();
7683  // FP constant to bias correct the final result.
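  // Explanatory note: 0x4330000000000000 is the bit pattern of the double 2^52.
  // OR-ing the zero-extended 32-bit input into its mantissa yields exactly
  // 2^52 + x, so the FSUB of the bias below leaves x converted to double.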
7684  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
7685                                   MVT::f64);
7686
7687  // Load the 32-bit value into an XMM register.
7688  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
7689                             Op.getOperand(0));
7690
7691  // Zero out the upper parts of the register.
7692  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
7693
7694  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
7695                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
7696                     DAG.getIntPtrConstant(0));
7697
7698  // Or the load with the bias.
7699  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
7700                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
7701                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7702                                                   MVT::v2f64, Load)),
7703                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
7704                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7705                                                   MVT::v2f64, Bias)));
7706  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
7707                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
7708                   DAG.getIntPtrConstant(0));
7709
7710  // Subtract the bias.
7711  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
7712
7713  // Handle final rounding.
7714  EVT DestVT = Op.getValueType();
7715
7716  if (DestVT.bitsLT(MVT::f64))
7717    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
7718                       DAG.getIntPtrConstant(0));
7719  if (DestVT.bitsGT(MVT::f64))
7720    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
7721
7722  // Handle final rounding.
7723  return Sub;
7724}
7725
7726SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
7727                                           SelectionDAG &DAG) const {
7728  SDValue N0 = Op.getOperand(0);
7729  DebugLoc dl = Op.getDebugLoc();
7730
7731  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
7732  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
7733  // the optimization here.
7734  if (DAG.SignBitIsZero(N0))
7735    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
7736
7737  EVT SrcVT = N0.getValueType();
7738  EVT DstVT = Op.getValueType();
7739  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
7740    return LowerUINT_TO_FP_i64(Op, DAG);
7741  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
7742    return LowerUINT_TO_FP_i32(Op, DAG);
7743  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
7744    return SDValue();
7745
7746  // Make a 64-bit buffer, and use it to build an FILD.
7747  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
7748  if (SrcVT == MVT::i32) {
7749    SDValue WordOff = DAG.getConstant(4, getPointerTy());
7750    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
7751                                     getPointerTy(), StackSlot, WordOff);
7752    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7753                                  StackSlot, MachinePointerInfo(),
7754                                  false, false, 0);
7755    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
7756                                  OffsetSlot, MachinePointerInfo(),
7757                                  false, false, 0);
7758    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
7759    return Fild;
7760  }
7761
7762  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
7763  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7764                               StackSlot, MachinePointerInfo(),
7765                               false, false, 0);
7766  // For i64 source, we need to add the appropriate power of 2 if the input
7767  // was negative.  This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
7769  // we must be careful to do the computation in x87 extended precision, not
7770  // in SSE. (The generic code can't know it's OK to do this, or how to.)
7771  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
7772  MachineMemOperand *MMO =
7773    DAG.getMachineFunction()
7774    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7775                          MachineMemOperand::MOLoad, 8, 8);
7776
7777  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
7778  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
7779  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
7780                                         MVT::i64, MMO);
7781
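  // Note: 0x5F800000 is the IEEE-754 single-precision encoding of 2^64. If the
  // i64 input had its sign bit set, FILD treated it as negative, so adding 2^64
  // (selected via the sign test below) corrects the result.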
7782  APInt FF(32, 0x5F800000ULL);
7783
7784  // Check whether the sign bit is set.
7785  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
7786                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
7787                                 ISD::SETLT);
7788
7789  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
7790  SDValue FudgePtr = DAG.getConstantPool(
7791                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
7792                                         getPointerTy());
7793
7794  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
7795  SDValue Zero = DAG.getIntPtrConstant(0);
7796  SDValue Four = DAG.getIntPtrConstant(4);
7797  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
7798                               Zero, Four);
7799  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
7800
7801  // Load the value out, extending it from f32 to f80.
7802  // FIXME: Avoid the extend by constructing the right constant pool?
7803  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
7804                                 FudgePtr, MachinePointerInfo::getConstantPool(),
7805                                 MVT::f32, false, false, 4);
7806  // Extend everything to 80 bits to force it to be done on x87.
7807  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
7808  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
7809}
7810
7811std::pair<SDValue,SDValue> X86TargetLowering::
7812FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const {
7813  DebugLoc DL = Op.getDebugLoc();
7814
7815  EVT DstTy = Op.getValueType();
7816
7817  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
7818    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
7819    DstTy = MVT::i64;
7820  }
7821
7822  assert(DstTy.getSimpleVT() <= MVT::i64 &&
7823         DstTy.getSimpleVT() >= MVT::i16 &&
7824         "Unknown FP_TO_INT to lower!");
7825
7826  // These are really Legal.
7827  if (DstTy == MVT::i32 &&
7828      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
7829    return std::make_pair(SDValue(), SDValue());
7830  if (Subtarget->is64Bit() &&
7831      DstTy == MVT::i64 &&
7832      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
7833    return std::make_pair(SDValue(), SDValue());
7834
7835  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
7836  // stack slot, or into the FTOL runtime function.
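  // Rough shape of the in-memory path (informal): spill an SSE value to a stack
  // slot, FLD it onto the x87 stack, FISTP it into a second slot, and let the
  // caller reload the integer from there; the FTOL path instead returns the
  // result in EDX:EAX.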
7837  MachineFunction &MF = DAG.getMachineFunction();
7838  unsigned MemSize = DstTy.getSizeInBits()/8;
7839  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
7840  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7841
7842  unsigned Opc;
7843  if (!IsSigned && isIntegerTypeFTOL(DstTy))
7844    Opc = X86ISD::WIN_FTOL;
7845  else
7846    switch (DstTy.getSimpleVT().SimpleTy) {
7847    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
7848    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
7849    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
7850    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
7851    }
7852
7853  SDValue Chain = DAG.getEntryNode();
7854  SDValue Value = Op.getOperand(0);
7855  EVT TheVT = Op.getOperand(0).getValueType();
  // FIXME: This causes a redundant load/store if the SSE-class value is already
  // in memory, such as if it is on the call stack.
7858  if (isScalarFPTypeInSSEReg(TheVT)) {
7859    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
7860    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
7861                         MachinePointerInfo::getFixedStack(SSFI),
7862                         false, false, 0);
7863    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
7864    SDValue Ops[] = {
7865      Chain, StackSlot, DAG.getValueType(TheVT)
7866    };
7867
7868    MachineMemOperand *MMO =
7869      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7870                              MachineMemOperand::MOLoad, MemSize, MemSize);
7871    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3,
7872                                    DstTy, MMO);
7873    Chain = Value.getValue(1);
7874    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
7875    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7876  }
7877
7878  MachineMemOperand *MMO =
7879    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7880                            MachineMemOperand::MOStore, MemSize, MemSize);
7881
7882  if (Opc != X86ISD::WIN_FTOL) {
7883    // Build the FP_TO_INT*_IN_MEM
7884    SDValue Ops[] = { Chain, Value, StackSlot };
7885    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
7886                                           Ops, 3, DstTy, MMO);
7887    return std::make_pair(FIST, StackSlot);
7888  } else {
7889    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
7890      DAG.getVTList(MVT::Other, MVT::Glue),
7891      Chain, Value);
7892    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
7893      MVT::i32, ftol.getValue(1));
7894    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
7895      MVT::i32, eax.getValue(2));
7896    SDValue Ops[] = { eax, edx };
7897    SDValue pair = IsReplace
7898      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2)
7899      : DAG.getMergeValues(Ops, 2, DL);
7900    return std::make_pair(pair, SDValue());
7901  }
7902}
7903
7904SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
7905                                           SelectionDAG &DAG) const {
7906  if (Op.getValueType().isVector())
7907    return SDValue();
7908
7909  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
7910    /*IsSigned=*/ true, /*IsReplace=*/ false);
7911  SDValue FIST = Vals.first, StackSlot = Vals.second;
7912  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
7913  if (FIST.getNode() == 0) return Op;
7914
7915  if (StackSlot.getNode())
7916    // Load the result.
7917    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
7918                       FIST, StackSlot, MachinePointerInfo(),
7919                       false, false, false, 0);
7920
7921  // The node is the result.
7922  return FIST;
7923}
7924
7925SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
7926                                           SelectionDAG &DAG) const {
7927  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
7928    /*IsSigned=*/ false, /*IsReplace=*/ false);
7929  SDValue FIST = Vals.first, StackSlot = Vals.second;
7930  assert(FIST.getNode() && "Unexpected failure");
7931
7932  if (StackSlot.getNode())
7933    // Load the result.
7934    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
7935                       FIST, StackSlot, MachinePointerInfo(),
7936                       false, false, false, 0);
7937
7938  // The node is the result.
7939  return FIST;
7940}
7941
7942SDValue X86TargetLowering::LowerFABS(SDValue Op,
7943                                     SelectionDAG &DAG) const {
7944  LLVMContext *Context = DAG.getContext();
7945  DebugLoc dl = Op.getDebugLoc();
7946  EVT VT = Op.getValueType();
7947  EVT EltVT = VT;
7948  if (VT.isVector())
7949    EltVT = VT.getVectorElementType();
7950  Constant *C;
7951  if (EltVT == MVT::f64) {
7952    C = ConstantVector::getSplat(2,
7953                ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
7954  } else {
7955    C = ConstantVector::getSplat(4,
7956               ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
7957  }
7958  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
7959  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
7960                             MachinePointerInfo::getConstantPool(),
7961                             false, false, false, 16);
7962  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
7963}
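
// Example of the lowering above (illustrative, not from the original source):
// fabs is implemented as a bitwise AND that clears the sign bit.  For f64 the
// constant pool holds a splat of ~(1ULL << 63) = 0x7FFFFFFFFFFFFFFF, and for
// f32 a four-element splat of ~(1U << 31) = 0x7FFFFFFF; the operand is then
// combined with that mask via X86ISD::FAND.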
7964
7965SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
7966  LLVMContext *Context = DAG.getContext();
7967  DebugLoc dl = Op.getDebugLoc();
7968  EVT VT = Op.getValueType();
7969  EVT EltVT = VT;
7970  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
7971  if (VT.isVector()) {
7972    EltVT = VT.getVectorElementType();
7973    NumElts = VT.getVectorNumElements();
7974  }
7975  Constant *C;
7976  if (EltVT == MVT::f64)
7977    C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
7978  else
7979    C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
7980  C = ConstantVector::getSplat(NumElts, C);
7981  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
7982  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
7983                             MachinePointerInfo::getConstantPool(),
7984                             false, false, false, 16);
7985  if (VT.isVector()) {
7986    MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64;
7987    return DAG.getNode(ISD::BITCAST, dl, VT,
7988                       DAG.getNode(ISD::XOR, dl, XORVT,
7989                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
7990                                               Op.getOperand(0)),
7991                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
7992  }
7993
7994  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
7995}
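
// Example of the lowering above (illustrative, not from the original source):
// fneg is a bitwise XOR with a splat of the sign-bit mask (1ULL << 63 for f64,
// 1U << 31 for f32).  Vector operands are bitcast to v2i64 or v4i64 so an
// ordinary integer XOR can be used, while scalars go through X86ISD::FXOR.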
7996
7997SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7998  LLVMContext *Context = DAG.getContext();
7999  SDValue Op0 = Op.getOperand(0);
8000  SDValue Op1 = Op.getOperand(1);
8001  DebugLoc dl = Op.getDebugLoc();
8002  EVT VT = Op.getValueType();
8003  EVT SrcVT = Op1.getValueType();
8004
8005  // If second operand is smaller, extend it first.
8006  if (SrcVT.bitsLT(VT)) {
8007    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
8008    SrcVT = VT;
8009  }
8010  // And if it is bigger, shrink it first.
8011  if (SrcVT.bitsGT(VT)) {
8012    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
8013    SrcVT = VT;
8014  }
8015
8016  // At this point the operands and the result should have the same
8017  // type, and that won't be f80 since that is not custom lowered.
8018
8019  // First get the sign bit of the second operand.
8020  SmallVector<Constant*,4> CV;
8021  if (SrcVT == MVT::f64) {
8022    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
8023    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
8024  } else {
8025    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
8026    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8027    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8028    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8029  }
8030  Constant *C = ConstantVector::get(CV);
8031  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8032  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
8033                              MachinePointerInfo::getConstantPool(),
8034                              false, false, false, 16);
8035  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
8036
8037  // Shift sign bit right or left if the two operands have different types.
8038  if (SrcVT.bitsGT(VT)) {
8039    // Op0 is MVT::f32, Op1 is MVT::f64.
8040    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
8041    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
8042                          DAG.getConstant(32, MVT::i32));
8043    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
8044    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
8045                          DAG.getIntPtrConstant(0));
8046  }
8047
8048  // Clear the sign bit of the first operand.
8049  CV.clear();
8050  if (VT == MVT::f64) {
8051    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
8052    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
8053  } else {
8054    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
8055    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8056    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8057    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8058  }
8059  C = ConstantVector::get(CV);
8060  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8061  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8062                              MachinePointerInfo::getConstantPool(),
8063                              false, false, false, 16);
8064  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
8065
8066  // Or the value with the sign bit.
8067  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
8068}
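
// Example of the lowering above (illustrative, not from the original source):
// copysign(x, y) is computed as (x & ~SignMask) | (y & SignMask).  Mask1
// extracts the sign bit of Op1, Mask2 clears the sign bit of Op0, and the two
// pieces are merged with X86ISD::FOR.  When Op1 is f64 but the result is f32,
// the extracted sign bit is shifted down by 32 with X86ISD::FSRL so that it
// lands in the f32 sign-bit position before being recombined.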
8069
8070SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
8071  SDValue N0 = Op.getOperand(0);
8072  DebugLoc dl = Op.getDebugLoc();
8073  EVT VT = Op.getValueType();
8074
8075  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
8076  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
8077                                  DAG.getConstant(1, VT));
8078  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
8079}
8080
8081/// Emit nodes that will be selected as "test Op0,Op0", or something
8082/// equivalent.
8083SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
8084                                    SelectionDAG &DAG) const {
8085  DebugLoc dl = Op.getDebugLoc();
8086
8087  // CF and OF aren't always set the way we want. Determine which
8088  // of these we need.
8089  bool NeedCF = false;
8090  bool NeedOF = false;
8091  switch (X86CC) {
8092  default: break;
8093  case X86::COND_A: case X86::COND_AE:
8094  case X86::COND_B: case X86::COND_BE:
8095    NeedCF = true;
8096    break;
8097  case X86::COND_G: case X86::COND_GE:
8098  case X86::COND_L: case X86::COND_LE:
8099  case X86::COND_O: case X86::COND_NO:
8100    NeedOF = true;
8101    break;
8102  }
8103
8104  // See if we can use the EFLAGS value from the operand instead of
8105  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
8106  // we prove that the arithmetic won't overflow, we can't use OF or CF.
8107  if (Op.getResNo() != 0 || NeedOF || NeedCF)
8108    // Emit a CMP with 0, which is the TEST pattern.
8109    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8110                       DAG.getConstant(0, Op.getValueType()));
8111
8112  unsigned Opcode = 0;
8113  unsigned NumOperands = 0;
8114  switch (Op.getNode()->getOpcode()) {
8115  case ISD::ADD:
8116    // Due to an isel shortcoming, be conservative if this add is likely to be
8117    // selected as part of a load-modify-store instruction. When the root node
8118    // in a match is a store, isel doesn't know how to remap non-chain non-flag
8119    // uses of other nodes in the match, such as the ADD in this case. This
8120    // leads to the ADD being left around and reselected, with the result being
8121    // two adds in the output.  Alas, even if none of our users are stores, that
8122    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
8123    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
8124    // climbing the DAG back to the root, and it doesn't seem to be worth the
8125    // effort.
8126    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8127         UE = Op.getNode()->use_end(); UI != UE; ++UI)
8128      if (UI->getOpcode() != ISD::CopyToReg &&
8129          UI->getOpcode() != ISD::SETCC &&
8130          UI->getOpcode() != ISD::STORE)
8131        goto default_case;
8132
8133    if (ConstantSDNode *C =
8134        dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
8135      // An add of one will be selected as an INC.
8136      if (C->getAPIntValue() == 1) {
8137        Opcode = X86ISD::INC;
8138        NumOperands = 1;
8139        break;
8140      }
8141
8142      // An add of negative one (subtract of one) will be selected as a DEC.
8143      if (C->getAPIntValue().isAllOnesValue()) {
8144        Opcode = X86ISD::DEC;
8145        NumOperands = 1;
8146        break;
8147      }
8148    }
8149
8150    // Otherwise use a regular EFLAGS-setting add.
8151    Opcode = X86ISD::ADD;
8152    NumOperands = 2;
8153    break;
8154  case ISD::AND: {
8155    // If the primary result of the 'and' isn't used, don't bother using
8156    // X86ISD::AND, because a TEST instruction will be better.
8157    bool NonFlagUse = false;
8158    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8159           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
8160      SDNode *User = *UI;
8161      unsigned UOpNo = UI.getOperandNo();
8162      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
8163        // Look past the truncate.
8164        UOpNo = User->use_begin().getOperandNo();
8165        User = *User->use_begin();
8166      }
8167
8168      if (User->getOpcode() != ISD::BRCOND &&
8169          User->getOpcode() != ISD::SETCC &&
8170          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
8171        NonFlagUse = true;
8172        break;
8173      }
8174    }
8175
8176    if (!NonFlagUse)
8177      break;
8178  }
8179    // FALL THROUGH
8180  case ISD::SUB:
8181  case ISD::OR:
8182  case ISD::XOR:
8183    // Due to the ISEL shortcoming noted above, be conservative if this op is
8184    // likely to be selected as part of a load-modify-store instruction.
8185    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8186           UE = Op.getNode()->use_end(); UI != UE; ++UI)
8187      if (UI->getOpcode() == ISD::STORE)
8188        goto default_case;
8189
8190    // Otherwise use a regular EFLAGS-setting instruction.
8191    switch (Op.getNode()->getOpcode()) {
8192    default: llvm_unreachable("unexpected operator!");
8193    case ISD::SUB: Opcode = X86ISD::SUB; break;
8194    case ISD::OR:  Opcode = X86ISD::OR;  break;
8195    case ISD::XOR: Opcode = X86ISD::XOR; break;
8196    case ISD::AND: Opcode = X86ISD::AND; break;
8197    }
8198
8199    NumOperands = 2;
8200    break;
8201  case X86ISD::ADD:
8202  case X86ISD::SUB:
8203  case X86ISD::INC:
8204  case X86ISD::DEC:
8205  case X86ISD::OR:
8206  case X86ISD::XOR:
8207  case X86ISD::AND:
8208    return SDValue(Op.getNode(), 1);
8209  default:
8210  default_case:
8211    break;
8212  }
8213
8214  if (Opcode == 0)
8215    // Emit a CMP with 0, which is the TEST pattern.
8216    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8217                       DAG.getConstant(0, Op.getValueType()));
8218
8219  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
8220  SmallVector<SDValue, 4> Ops;
8221  for (unsigned i = 0; i != NumOperands; ++i)
8222    Ops.push_back(Op.getOperand(i));
8223
8224  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
8225  DAG.ReplaceAllUsesWith(Op, New);
8226  return SDValue(New.getNode(), 1);
8227}
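
// Example of what EmitTest is trying to achieve (illustrative, not from the
// original source): given
//   %a = add i32 %x, 1
//   %c = icmp eq i32 %a, 0
// where %a is only consumed by the compare, the add is re-emitted as
// X86ISD::INC, whose second result is EFLAGS, and no separate CMP/TEST
// against zero is needed; the SETCC reads ZF directly from the INC.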
8228
8229/// Emit nodes that will be selected as "cmp Op0,Op1", or something
8230/// equivalent.
8231SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
8232                                   SelectionDAG &DAG) const {
8233  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
8234    if (C->getAPIntValue() == 0)
8235      return EmitTest(Op0, X86CC, DAG);
8236
8237  DebugLoc dl = Op0.getDebugLoc();
8238  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
8239}
8240
8241/// Convert a comparison if required by the subtarget.
8242SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
8243                                                 SelectionDAG &DAG) const {
8244  // If the subtarget does not support the FUCOMI instruction, floating-point
8245  // comparisons have to be converted.
8246  if (Subtarget->hasCMov() ||
8247      Cmp.getOpcode() != X86ISD::CMP ||
8248      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
8249      !Cmp.getOperand(1).getValueType().isFloatingPoint())
8250    return Cmp;
8251
8252  // The instruction selector will select an FUCOM instruction instead of
8253  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
8254  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
8255  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
8256  DebugLoc dl = Cmp.getDebugLoc();
8257  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
8258  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
8259  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
8260                            DAG.getConstant(8, MVT::i8));
8261  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
8262  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
8263}
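
// Rough shape of the machine code this produces on targets without FUCOMI
// (illustrative, not from the original source):
//   fucom            ; compare, result lands in the FP status word
//   fnstsw %ax       ; copy FPSW into AX
//   sahf             ; move AH (the relevant status bits) into EFLAGS
// after which the normal EFLAGS-based SETCC/JCC/CMOV lowering applies.  The
// SRL-by-8 plus TRUNCATE above models the "take the high byte of AX" step.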
8264
8265/// LowerToBT - The result of an 'and' is compared against zero. Turn it into
8266/// a BT node if possible.
8267SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
8268                                     DebugLoc dl, SelectionDAG &DAG) const {
8269  SDValue Op0 = And.getOperand(0);
8270  SDValue Op1 = And.getOperand(1);
8271  if (Op0.getOpcode() == ISD::TRUNCATE)
8272    Op0 = Op0.getOperand(0);
8273  if (Op1.getOpcode() == ISD::TRUNCATE)
8274    Op1 = Op1.getOperand(0);
8275
8276  SDValue LHS, RHS;
8277  if (Op1.getOpcode() == ISD::SHL)
8278    std::swap(Op0, Op1);
8279  if (Op0.getOpcode() == ISD::SHL) {
8280    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
8281      if (And00C->getZExtValue() == 1) {
8282        // If we looked past a truncate, check that it's only truncating away
8283        // known zeros.
8284        unsigned BitWidth = Op0.getValueSizeInBits();
8285        unsigned AndBitWidth = And.getValueSizeInBits();
8286        if (BitWidth > AndBitWidth) {
8287          APInt Zeros, Ones;
8288          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
8289          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
8290            return SDValue();
8291        }
8292        LHS = Op1;
8293        RHS = Op0.getOperand(1);
8294      }
8295  } else if (Op1.getOpcode() == ISD::Constant) {
8296    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
8297    uint64_t AndRHSVal = AndRHS->getZExtValue();
8298    SDValue AndLHS = Op0;
8299
8300    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
8301      LHS = AndLHS.getOperand(0);
8302      RHS = AndLHS.getOperand(1);
8303    }
8304
8305    // Use BT if the immediate can't be encoded in a TEST instruction.
8306    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
8307      LHS = AndLHS;
8308      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
8309    }
8310  }
8311
8312  if (LHS.getNode()) {
8313    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
8314    // instruction.  Since the shift amount is in-range-or-undefined, we know
8315    // that doing a bittest on the i32 value is ok.  We extend to i32 because
8316    // the encoding for the i16 version is larger than the i32 version.
8317    // Also promote i16 to i32 for performance / code size reasons.
8318    if (LHS.getValueType() == MVT::i8 ||
8319        LHS.getValueType() == MVT::i16)
8320      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
8321
8322    // If the operand types disagree, extend the shift amount to match.  Since
8323    // BT ignores high bits (like shifts) we can use anyextend.
8324    if (LHS.getValueType() != RHS.getValueType())
8325      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
8326
8327    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
8328    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
8329    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
8330                       DAG.getConstant(Cond, MVT::i8), BT);
8331  }
8332
8333  return SDValue();
8334}
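
// Example of the transformation above (illustrative, not from the original
// source): for a variable bit test such as
//   %bit = shl i32 1, %n
//   %m   = and i32 %x, %bit
//   %c   = icmp ne i32 %m, 0
// LowerToBT emits (X86bt %x, %n) followed by a SETCC on COND_B (COND_AE for
// the SETEQ form), since BT copies the tested bit into the carry flag.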
8335
8336SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
8337
8338  if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG);
8339
8340  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
8341  SDValue Op0 = Op.getOperand(0);
8342  SDValue Op1 = Op.getOperand(1);
8343  DebugLoc dl = Op.getDebugLoc();
8344  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8345
8346  // Optimize to BT if possible.
8347  // Lower (X & (1 << N)) == 0 to BT(X, N).
8348  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
8349  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
8350  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
8351      Op1.getOpcode() == ISD::Constant &&
8352      cast<ConstantSDNode>(Op1)->isNullValue() &&
8353      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
8354    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
8355    if (NewSetCC.getNode())
8356      return NewSetCC;
8357  }
8358
8359  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
8360  // these.
8361  if (Op1.getOpcode() == ISD::Constant &&
8362      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
8363       cast<ConstantSDNode>(Op1)->isNullValue()) &&
8364      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
8365
8366    // If the input is a setcc, then reuse the input setcc or use a new one with
8367    // the inverted condition.
8368    if (Op0.getOpcode() == X86ISD::SETCC) {
8369      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
8370      bool Invert = (CC == ISD::SETNE) ^
8371        cast<ConstantSDNode>(Op1)->isNullValue();
8372      if (!Invert) return Op0;
8373
8374      CCode = X86::GetOppositeBranchCondition(CCode);
8375      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
8376                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
8377    }
8378  }
8379
8380  bool isFP = Op1.getValueType().isFloatingPoint();
8381  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
8382  if (X86CC == X86::COND_INVALID)
8383    return SDValue();
8384
8385  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
8386  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
8387  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
8388                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
8389}
8390
8391// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
8392// ones, and then concatenate the result back.
8393static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
8394  EVT VT = Op.getValueType();
8395
8396  assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
8397         "Unsupported value type for operation");
8398
8399  unsigned NumElems = VT.getVectorNumElements();
8400  DebugLoc dl = Op.getDebugLoc();
8401  SDValue CC = Op.getOperand(2);
8402
8403  // Extract the LHS vectors
8404  SDValue LHS = Op.getOperand(0);
8405  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
8406  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
8407
8408  // Extract the RHS vectors
8409  SDValue RHS = Op.getOperand(1);
8410  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
8411  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
8412
8413  // Issue the operation on the smaller types and concatenate the result back
8414  MVT EltVT = VT.getVectorElementType().getSimpleVT();
8415  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
8416  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
8417                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
8418                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
8419}
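
// Example (illustrative, not from the original source): on AVX without AVX2
// there are no 256-bit integer compare instructions, so a v8i32 SETCC is
// split here into two v4i32 compares on the low and high 128-bit halves and
// the two results are rejoined with CONCAT_VECTORS.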
8420
8421
8422SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
8423  SDValue Cond;
8424  SDValue Op0 = Op.getOperand(0);
8425  SDValue Op1 = Op.getOperand(1);
8426  SDValue CC = Op.getOperand(2);
8427  EVT VT = Op.getValueType();
8428  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
8429  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
8430  DebugLoc dl = Op.getDebugLoc();
8431
8432  if (isFP) {
8433    unsigned SSECC = 8;
8434    EVT EltVT = Op0.getValueType().getVectorElementType();
8435    assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT;
8436
8437    bool Swap = false;
8438
8439    // SSE Condition code mapping:
8440    //  0 - EQ
8441    //  1 - LT
8442    //  2 - LE
8443    //  3 - UNORD
8444    //  4 - NEQ
8445    //  5 - NLT
8446    //  6 - NLE
8447    //  7 - ORD
8448    switch (SetCCOpcode) {
8449    default: break;
8450    case ISD::SETOEQ:
8451    case ISD::SETEQ:  SSECC = 0; break;
8452    case ISD::SETOGT:
8453    case ISD::SETGT: Swap = true; // Fallthrough
8454    case ISD::SETLT:
8455    case ISD::SETOLT: SSECC = 1; break;
8456    case ISD::SETOGE:
8457    case ISD::SETGE: Swap = true; // Fallthrough
8458    case ISD::SETLE:
8459    case ISD::SETOLE: SSECC = 2; break;
8460    case ISD::SETUO:  SSECC = 3; break;
8461    case ISD::SETUNE:
8462    case ISD::SETNE:  SSECC = 4; break;
8463    case ISD::SETULE: Swap = true;
8464    case ISD::SETUGE: SSECC = 5; break;
8465    case ISD::SETULT: Swap = true;
8466    case ISD::SETUGT: SSECC = 6; break;
8467    case ISD::SETO:   SSECC = 7; break;
8468    }
8469    if (Swap)
8470      std::swap(Op0, Op1);
8471
8472    // In the two special cases we can't handle, emit two comparisons.
8473    if (SSECC == 8) {
8474      if (SetCCOpcode == ISD::SETUEQ) {
8475        SDValue UNORD, EQ;
8476        UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
8477                            DAG.getConstant(3, MVT::i8));
8478        EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
8479                         DAG.getConstant(0, MVT::i8));
8480        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
8481      }
8482      if (SetCCOpcode == ISD::SETONE) {
8483        SDValue ORD, NEQ;
8484        ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
8485                          DAG.getConstant(7, MVT::i8));
8486        NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
8487                          DAG.getConstant(4, MVT::i8));
8488        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
8489      }
8490      llvm_unreachable("Illegal FP comparison");
8491    }
8492    // Handle all other FP comparisons here.
8493    return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
8494                       DAG.getConstant(SSECC, MVT::i8));
8495  }
8496
8497  // Break 256-bit integer vector compare into smaller ones.
8498  if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
8499    return Lower256IntVSETCC(Op, DAG);
8500
8501  // We are handling one of the integer comparisons here.  Since SSE only has
8502  // GT and EQ comparisons for integers, swapping operands and multiple
8503  // operations may be required for some comparisons.
8504  unsigned Opc = 0;
8505  bool Swap = false, Invert = false, FlipSigns = false;
8506
8507  switch (SetCCOpcode) {
8508  default: break;
8509  case ISD::SETNE:  Invert = true;
8510  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
8511  case ISD::SETLT:  Swap = true;
8512  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
8513  case ISD::SETGE:  Swap = true;
8514  case ISD::SETLE:  Opc = X86ISD::PCMPGT; Invert = true; break;
8515  case ISD::SETULT: Swap = true;
8516  case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
8517  case ISD::SETUGE: Swap = true;
8518  case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
8519  }
8520  if (Swap)
8521    std::swap(Op0, Op1);
8522
8523  // Check that the operation in question is available (most are plain SSE2,
8524  // but PCMPGTQ and PCMPEQQ have different requirements).
8525  if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42())
8526    return SDValue();
8527  if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41())
8528    return SDValue();
8529
8530  // Since SSE has no unsigned integer comparisons, we need to flip the sign
8531  // bits of the inputs before performing those operations.
8532  if (FlipSigns) {
8533    EVT EltVT = VT.getVectorElementType();
8534    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
8535                                      EltVT);
8536    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
8537    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
8538                                    SignBits.size());
8539    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
8540    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
8541  }
8542
8543  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
8544
8545  // If the logical-not of the result is required, perform that now.
8546  if (Invert)
8547    Result = DAG.getNOT(dl, Result, VT);
8548
8549  return Result;
8550}
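
// Example of the unsigned-integer handling above (illustrative, not from the
// original source): a v4i32 setult first swaps its operands (ult a,b is
// ugt b,a), then XORs both inputs with a splat of 0x80000000 so that unsigned
// order maps onto signed order, and finally issues a single PCMPGT.  The
// complementary predicates (e.g. setule) reuse PCMPGT and apply a trailing
// NOT via the Invert flag.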
8551
8552// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
8553static bool isX86LogicalCmp(SDValue Op) {
8554  unsigned Opc = Op.getNode()->getOpcode();
8555  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
8556      Opc == X86ISD::SAHF)
8557    return true;
8558  if (Op.getResNo() == 1 &&
8559      (Opc == X86ISD::ADD ||
8560       Opc == X86ISD::SUB ||
8561       Opc == X86ISD::ADC ||
8562       Opc == X86ISD::SBB ||
8563       Opc == X86ISD::SMUL ||
8564       Opc == X86ISD::UMUL ||
8565       Opc == X86ISD::INC ||
8566       Opc == X86ISD::DEC ||
8567       Opc == X86ISD::OR ||
8568       Opc == X86ISD::XOR ||
8569       Opc == X86ISD::AND))
8570    return true;
8571
8572  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
8573    return true;
8574
8575  return false;
8576}
8577
8578static bool isZero(SDValue V) {
8579  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
8580  return C && C->isNullValue();
8581}
8582
8583static bool isAllOnes(SDValue V) {
8584  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
8585  return C && C->isAllOnesValue();
8586}
8587
8588SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
8589  bool addTest = true;
8590  SDValue Cond  = Op.getOperand(0);
8591  SDValue Op1 = Op.getOperand(1);
8592  SDValue Op2 = Op.getOperand(2);
8593  DebugLoc DL = Op.getDebugLoc();
8594  SDValue CC;
8595
8596  if (Cond.getOpcode() == ISD::SETCC) {
8597    SDValue NewCond = LowerSETCC(Cond, DAG);
8598    if (NewCond.getNode())
8599      Cond = NewCond;
8600  }
8601
8602  // Handle the following cases related to max and min:
8603  // (a > b) ? (a-b) : 0
8604  // (a >= b) ? (a-b) : 0
8605  // (b < a) ? (a-b) : 0
8606  // (b <= a) ? (a-b) : 0
8607  // Comparison is removed to use EFLAGS from SUB.
8608  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2))
8609    if (Cond.getOpcode() == X86ISD::SETCC &&
8610        Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
8611        (Op1.getOpcode() == ISD::SUB || Op1.getOpcode() == X86ISD::SUB) &&
8612        C->getAPIntValue() == 0) {
8613      SDValue Cmp = Cond.getOperand(1);
8614      unsigned CC = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
8615      if ((DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(0)) &&
8616           DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(1)) &&
8617           (CC == X86::COND_G || CC == X86::COND_GE ||
8618            CC == X86::COND_A || CC == X86::COND_AE)) ||
8619          (DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(1)) &&
8620           DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(0)) &&
8621           (CC == X86::COND_L || CC == X86::COND_LE ||
8622            CC == X86::COND_B || CC == X86::COND_BE))) {
8623
8624        if (Op1.getOpcode() == ISD::SUB) {
8625          SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i32);
8626          SDValue New = DAG.getNode(X86ISD::SUB, DL, VTs,
8627                                    Op1.getOperand(0), Op1.getOperand(1));
8628          DAG.ReplaceAllUsesWith(Op1, New);
8629          Op1 = New;
8630        }
8631
8632        SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
8633        unsigned NewCC = (CC == X86::COND_G || CC == X86::COND_GE ||
8634                          CC == X86::COND_L ||
8635                          CC == X86::COND_LE) ? X86::COND_GE : X86::COND_AE;
8636        SDValue Ops[] = { Op2, Op1, DAG.getConstant(NewCC, MVT::i8),
8637                          SDValue(Op1.getNode(), 1) };
8638        return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
8639      }
8640    }
8641
8642  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
8643  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
8644  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
8645  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
8646  if (Cond.getOpcode() == X86ISD::SETCC &&
8647      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
8648      isZero(Cond.getOperand(1).getOperand(1))) {
8649    SDValue Cmp = Cond.getOperand(1);
8650
8651    unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
8652
8653    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
8654        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
8655      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
8656
8657      SDValue CmpOp0 = Cmp.getOperand(0);
8658      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
8659                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
8660      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
8661
8662      SDValue Res =   // Res = 0 or -1.
8663        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
8664                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
8665
8666      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
8667        Res = DAG.getNOT(DL, Res, Res.getValueType());
8668
8669      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
8670      if (N2C == 0 || !N2C->isNullValue())
8671        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
8672      return Res;
8673    }
8674  }
8675
8676  // Look past (and (setcc_carry (cmp ...)), 1).
8677  if (Cond.getOpcode() == ISD::AND &&
8678      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
8679    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
8680    if (C && C->getAPIntValue() == 1)
8681      Cond = Cond.getOperand(0);
8682  }
8683
8684  // If condition flag is set by a X86ISD::CMP, then use it as the condition
8685  // setting operand in place of the X86ISD::SETCC.
8686  unsigned CondOpcode = Cond.getOpcode();
8687  if (CondOpcode == X86ISD::SETCC ||
8688      CondOpcode == X86ISD::SETCC_CARRY) {
8689    CC = Cond.getOperand(0);
8690
8691    SDValue Cmp = Cond.getOperand(1);
8692    unsigned Opc = Cmp.getOpcode();
8693    EVT VT = Op.getValueType();
8694
8695    bool IllegalFPCMov = false;
8696    if (VT.isFloatingPoint() && !VT.isVector() &&
8697        !isScalarFPTypeInSSEReg(VT))  // FPStack?
8698      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
8699
8700    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
8701        Opc == X86ISD::BT) { // FIXME
8702      Cond = Cmp;
8703      addTest = false;
8704    }
8705  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
8706             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
8707             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
8708              Cond.getOperand(0).getValueType() != MVT::i8)) {
8709    SDValue LHS = Cond.getOperand(0);
8710    SDValue RHS = Cond.getOperand(1);
8711    unsigned X86Opcode;
8712    unsigned X86Cond;
8713    SDVTList VTs;
8714    switch (CondOpcode) {
8715    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
8716    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
8717    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
8718    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
8719    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
8720    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
8721    default: llvm_unreachable("unexpected overflowing operator");
8722    }
8723    if (CondOpcode == ISD::UMULO)
8724      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
8725                          MVT::i32);
8726    else
8727      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
8728
8729    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
8730
8731    if (CondOpcode == ISD::UMULO)
8732      Cond = X86Op.getValue(2);
8733    else
8734      Cond = X86Op.getValue(1);
8735
8736    CC = DAG.getConstant(X86Cond, MVT::i8);
8737    addTest = false;
8738  }
8739
8740  if (addTest) {
8741    // Look past the truncate.
8742    if (Cond.getOpcode() == ISD::TRUNCATE)
8743      Cond = Cond.getOperand(0);
8744
8745    // We know the result of AND is compared against zero. Try to match
8746    // it to BT.
8747    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
8748      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
8749      if (NewSetCC.getNode()) {
8750        CC = NewSetCC.getOperand(0);
8751        Cond = NewSetCC.getOperand(1);
8752        addTest = false;
8753      }
8754    }
8755  }
8756
8757  if (addTest) {
8758    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
8759    Cond = EmitTest(Cond, X86::COND_NE, DAG);
8760  }
8761
8762  // a <  b ? -1 :  0 -> RES = ~setcc_carry
8763  // a <  b ?  0 : -1 -> RES = setcc_carry
8764  // a >= b ? -1 :  0 -> RES = setcc_carry
8765  // a >= b ?  0 : -1 -> RES = ~setcc_carry
8766  if (Cond.getOpcode() == X86ISD::CMP) {
8767    Cond = ConvertCmpIfNecessary(Cond, DAG);
8768    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
8769
8770    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
8771        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
8772      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
8773                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
8774      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
8775        return DAG.getNOT(DL, Res, Res.getValueType());
8776      return Res;
8777    }
8778  }
8779
8780  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
8781  // condition is true.
8782  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
8783  SDValue Ops[] = { Op2, Op1, CC, Cond };
8784  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
8785}
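
// Example of the (select (x == 0), -1, y) pattern handled above (illustrative,
// not from the original source): the compare is rewritten as CMP x, 1 so the
// carry flag is set exactly when x == 0, X86ISD::SETCC_CARRY materializes
// 0 or -1 from that carry (an SBB-style idiom), and the result is ORed with y,
// avoiding a conditional move entirely.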
8786
8787// isAndOrOfSetCCs - Return true if the node is an ISD::AND or ISD::OR of
8788// two X86ISD::SETCC nodes, each of which has no other use apart from the
8789// AND / OR.
8790static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
8791  Opc = Op.getOpcode();
8792  if (Opc != ISD::OR && Opc != ISD::AND)
8793    return false;
8794  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
8795          Op.getOperand(0).hasOneUse() &&
8796          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
8797          Op.getOperand(1).hasOneUse());
8798}
8799
8800// isXor1OfSetCC - Return true if the node is an ISD::XOR of an X86ISD::SETCC
8801// and 1, and the SETCC node has a single use.
8802static bool isXor1OfSetCC(SDValue Op) {
8803  if (Op.getOpcode() != ISD::XOR)
8804    return false;
8805  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
8806  if (N1C && N1C->getAPIntValue() == 1) {
8807    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
8808      Op.getOperand(0).hasOneUse();
8809  }
8810  return false;
8811}
8812
8813SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
8814  bool addTest = true;
8815  SDValue Chain = Op.getOperand(0);
8816  SDValue Cond  = Op.getOperand(1);
8817  SDValue Dest  = Op.getOperand(2);
8818  DebugLoc dl = Op.getDebugLoc();
8819  SDValue CC;
8820  bool Inverted = false;
8821
8822  if (Cond.getOpcode() == ISD::SETCC) {
8823    // Check for setcc([su]{add,sub,mul}o == 0).
8824    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
8825        isa<ConstantSDNode>(Cond.getOperand(1)) &&
8826        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
8827        Cond.getOperand(0).getResNo() == 1 &&
8828        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
8829         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
8830         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
8831         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
8832         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
8833         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
8834      Inverted = true;
8835      Cond = Cond.getOperand(0);
8836    } else {
8837      SDValue NewCond = LowerSETCC(Cond, DAG);
8838      if (NewCond.getNode())
8839        Cond = NewCond;
8840    }
8841  }
8842#if 0
8843  // FIXME: LowerXALUO doesn't handle these!!
8844  else if (Cond.getOpcode() == X86ISD::ADD  ||
8845           Cond.getOpcode() == X86ISD::SUB  ||
8846           Cond.getOpcode() == X86ISD::SMUL ||
8847           Cond.getOpcode() == X86ISD::UMUL)
8848    Cond = LowerXALUO(Cond, DAG);
8849#endif
8850
8851  // Look past (and (setcc_carry (cmp ...)), 1).
8852  if (Cond.getOpcode() == ISD::AND &&
8853      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
8854    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
8855    if (C && C->getAPIntValue() == 1)
8856      Cond = Cond.getOperand(0);
8857  }
8858
8859  // If condition flag is set by a X86ISD::CMP, then use it as the condition
8860  // setting operand in place of the X86ISD::SETCC.
8861  unsigned CondOpcode = Cond.getOpcode();
8862  if (CondOpcode == X86ISD::SETCC ||
8863      CondOpcode == X86ISD::SETCC_CARRY) {
8864    CC = Cond.getOperand(0);
8865
8866    SDValue Cmp = Cond.getOperand(1);
8867    unsigned Opc = Cmp.getOpcode();
8868    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
8869    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
8870      Cond = Cmp;
8871      addTest = false;
8872    } else {
8873      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
8874      default: break;
8875      case X86::COND_O:
8876      case X86::COND_B:
8877        // These can only come from an arithmetic instruction with overflow,
8878        // e.g. SADDO, UADDO.
8879        Cond = Cond.getNode()->getOperand(1);
8880        addTest = false;
8881        break;
8882      }
8883    }
8884  }
8885  CondOpcode = Cond.getOpcode();
8886  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
8887      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
8888      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
8889       Cond.getOperand(0).getValueType() != MVT::i8)) {
8890    SDValue LHS = Cond.getOperand(0);
8891    SDValue RHS = Cond.getOperand(1);
8892    unsigned X86Opcode;
8893    unsigned X86Cond;
8894    SDVTList VTs;
8895    switch (CondOpcode) {
8896    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
8897    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
8898    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
8899    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
8900    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
8901    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
8902    default: llvm_unreachable("unexpected overflowing operator");
8903    }
8904    if (Inverted)
8905      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
8906    if (CondOpcode == ISD::UMULO)
8907      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
8908                          MVT::i32);
8909    else
8910      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
8911
8912    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
8913
8914    if (CondOpcode == ISD::UMULO)
8915      Cond = X86Op.getValue(2);
8916    else
8917      Cond = X86Op.getValue(1);
8918
8919    CC = DAG.getConstant(X86Cond, MVT::i8);
8920    addTest = false;
8921  } else {
8922    unsigned CondOpc;
8923    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
8924      SDValue Cmp = Cond.getOperand(0).getOperand(1);
8925      if (CondOpc == ISD::OR) {
8926        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
8927        // two branches instead of an explicit OR instruction with a
8928        // separate test.
8929        if (Cmp == Cond.getOperand(1).getOperand(1) &&
8930            isX86LogicalCmp(Cmp)) {
8931          CC = Cond.getOperand(0).getOperand(0);
8932          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
8933                              Chain, Dest, CC, Cmp);
8934          CC = Cond.getOperand(1).getOperand(0);
8935          Cond = Cmp;
8936          addTest = false;
8937        }
8938      } else { // ISD::AND
8939        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
8940        // two branches instead of an explicit AND instruction with a
8941        // separate test. However, we only do this if this block doesn't
8942        // have a fall-through edge, because this requires an explicit
8943        // jmp when the condition is false.
8944        if (Cmp == Cond.getOperand(1).getOperand(1) &&
8945            isX86LogicalCmp(Cmp) &&
8946            Op.getNode()->hasOneUse()) {
8947          X86::CondCode CCode =
8948            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
8949          CCode = X86::GetOppositeBranchCondition(CCode);
8950          CC = DAG.getConstant(CCode, MVT::i8);
8951          SDNode *User = *Op.getNode()->use_begin();
8952          // Look for an unconditional branch following this conditional branch.
8953          // We need this because we need to reverse the successors in order
8954          // to implement FCMP_OEQ.
8955          if (User->getOpcode() == ISD::BR) {
8956            SDValue FalseBB = User->getOperand(1);
8957            SDNode *NewBR =
8958              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
8959            assert(NewBR == User);
8960            (void)NewBR;
8961            Dest = FalseBB;
8962
8963            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
8964                                Chain, Dest, CC, Cmp);
8965            X86::CondCode CCode =
8966              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
8967            CCode = X86::GetOppositeBranchCondition(CCode);
8968            CC = DAG.getConstant(CCode, MVT::i8);
8969            Cond = Cmp;
8970            addTest = false;
8971          }
8972        }
8973      }
8974    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
8975      // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
8976      // It should be transformed by the DAG combiner except when the condition
8977      // is set by an arithmetic-with-overflow node.
8978      X86::CondCode CCode =
8979        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
8980      CCode = X86::GetOppositeBranchCondition(CCode);
8981      CC = DAG.getConstant(CCode, MVT::i8);
8982      Cond = Cond.getOperand(0).getOperand(1);
8983      addTest = false;
8984    } else if (Cond.getOpcode() == ISD::SETCC &&
8985               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
8986      // For FCMP_OEQ, we can emit
8987      // two branches instead of an explicit AND instruction with a
8988      // separate test. However, we only do this if this block doesn't
8989      // have a fall-through edge, because this requires an explicit
8990      // jmp when the condition is false.
8991      if (Op.getNode()->hasOneUse()) {
8992        SDNode *User = *Op.getNode()->use_begin();
8993        // Look for an unconditional branch following this conditional branch.
8994        // We need this because we need to reverse the successors in order
8995        // to implement FCMP_OEQ.
8996        if (User->getOpcode() == ISD::BR) {
8997          SDValue FalseBB = User->getOperand(1);
8998          SDNode *NewBR =
8999            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9000          assert(NewBR == User);
9001          (void)NewBR;
9002          Dest = FalseBB;
9003
9004          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
9005                                    Cond.getOperand(0), Cond.getOperand(1));
9006          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9007          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9008          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9009                              Chain, Dest, CC, Cmp);
9010          CC = DAG.getConstant(X86::COND_P, MVT::i8);
9011          Cond = Cmp;
9012          addTest = false;
9013        }
9014      }
9015    } else if (Cond.getOpcode() == ISD::SETCC &&
9016               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
9017      // For FCMP_UNE, we can emit
9018      // two branches instead of an explicit AND instruction with a
9019      // separate test. However, we only do this if this block doesn't
9020      // have a fall-through edge, because this requires an explicit
9021      // jmp when the condition is false.
9022      if (Op.getNode()->hasOneUse()) {
9023        SDNode *User = *Op.getNode()->use_begin();
9024        // Look for an unconditional branch following this conditional branch.
9025        // We need this because we need to reverse the successors in order
9026        // to implement FCMP_UNE.
9027        if (User->getOpcode() == ISD::BR) {
9028          SDValue FalseBB = User->getOperand(1);
9029          SDNode *NewBR =
9030            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9031          assert(NewBR == User);
9032          (void)NewBR;
9033
9034          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
9035                                    Cond.getOperand(0), Cond.getOperand(1));
9036          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9037          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9038          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9039                              Chain, Dest, CC, Cmp);
9040          CC = DAG.getConstant(X86::COND_NP, MVT::i8);
9041          Cond = Cmp;
9042          addTest = false;
9043          Dest = FalseBB;
9044        }
9045      }
9046    }
9047  }
9048
9049  if (addTest) {
9050    // Look past the truncate.
9051    if (Cond.getOpcode() == ISD::TRUNCATE)
9052      Cond = Cond.getOperand(0);
9053
9054    // We know the result of AND is compared against zero. Try to match
9055    // it to BT.
9056    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
9057      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
9058      if (NewSetCC.getNode()) {
9059        CC = NewSetCC.getOperand(0);
9060        Cond = NewSetCC.getOperand(1);
9061        addTest = false;
9062      }
9063    }
9064  }
9065
9066  if (addTest) {
9067    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9068    Cond = EmitTest(Cond, X86::COND_NE, DAG);
9069  }
9070  Cond = ConvertCmpIfNecessary(Cond, DAG);
9071  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9072                     Chain, Dest, CC, Cond);
9073}
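
// Example of the FCMP_OEQ handling above (illustrative, not from the original
// source): for "br (fcmp oeq x, y), %true, %false" followed by an
// unconditional branch, the successors are reversed and two conditional
// branches are emitted off a single FP compare:
//   jne %false     ; not equal
//   jp  %false     ; unordered (NaN operands)
// with the fall-through reaching %true, so no explicit AND of two setcc
// results is required.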
9074
9075
9076// Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
9077// Calls to _alloca are needed to probe the stack when allocating more than 4k
9078// bytes in one go. Touching the stack at 4K increments is necessary to ensure
9079// that the guard pages used by the OS virtual memory manager are allocated in
9080// the correct sequence.
9081SDValue
9082X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
9083                                           SelectionDAG &DAG) const {
9084  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
9085          getTargetMachine().Options.EnableSegmentedStacks) &&
9086         "This should be used only on Windows targets or when segmented stacks "
9087         "are being used");
9088  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
9089  DebugLoc dl = Op.getDebugLoc();
9090
9091  // Get the inputs.
9092  SDValue Chain = Op.getOperand(0);
9093  SDValue Size  = Op.getOperand(1);
9094  // FIXME: Ensure alignment here
9095
9096  bool Is64Bit = Subtarget->is64Bit();
9097  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
9098
9099  if (getTargetMachine().Options.EnableSegmentedStacks) {
9100    MachineFunction &MF = DAG.getMachineFunction();
9101    MachineRegisterInfo &MRI = MF.getRegInfo();
9102
9103    if (Is64Bit) {
9104      // The 64-bit implementation of segmented stacks needs to clobber both r10
9105      // and r11, so segmented stacks cannot be used with nested parameters.
9106      const Function *F = MF.getFunction();
9107
9108      for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
9109           I != E; I++)
9110        if (I->hasNestAttr())
9111          report_fatal_error("Cannot use segmented stacks with functions that "
9112                             "have nested arguments.");
9113    }
9114
9115    const TargetRegisterClass *AddrRegClass =
9116      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
9117    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
9118    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
9119    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
9120                                DAG.getRegister(Vreg, SPTy));
9121    SDValue Ops1[2] = { Value, Chain };
9122    return DAG.getMergeValues(Ops1, 2, dl);
9123  } else {
9124    SDValue Flag;
9125    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
9126
9127    Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
9128    Flag = Chain.getValue(1);
9129    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9130
9131    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
9132    Flag = Chain.getValue(1);
9133
9134    Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
9135
9136    SDValue Ops1[2] = { Chain.getValue(0), Chain };
9137    return DAG.getMergeValues(Ops1, 2, dl);
9138  }
9139}
9140
9141SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
9142  MachineFunction &MF = DAG.getMachineFunction();
9143  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
9144
9145  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9146  DebugLoc DL = Op.getDebugLoc();
9147
9148  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
9149    // vastart just stores the address of the VarArgsFrameIndex slot into the
9150    // memory location argument.
9151    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
9152                                   getPointerTy());
9153    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9154                        MachinePointerInfo(SV), false, false, 0);
9155  }
9156
9157  // __va_list_tag:
9158  //   gp_offset         (0 - 6 * 8)
9159  //   fp_offset         (48 - 48 + 8 * 16)
9160  //   overflow_arg_area (point to parameters coming in memory).
9161  //   reg_save_area
9162  SmallVector<SDValue, 8> MemOps;
9163  SDValue FIN = Op.getOperand(1);
9164  // Store gp_offset
9165  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
9166                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
9167                                               MVT::i32),
9168                               FIN, MachinePointerInfo(SV), false, false, 0);
9169  MemOps.push_back(Store);
9170
9171  // Store fp_offset
9172  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9173                    FIN, DAG.getIntPtrConstant(4));
9174  Store = DAG.getStore(Op.getOperand(0), DL,
9175                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
9176                                       MVT::i32),
9177                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
9178  MemOps.push_back(Store);
9179
9180  // Store ptr to overflow_arg_area
9181  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9182                    FIN, DAG.getIntPtrConstant(4));
9183  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
9184                                    getPointerTy());
9185  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
9186                       MachinePointerInfo(SV, 8),
9187                       false, false, 0);
9188  MemOps.push_back(Store);
9189
9190  // Store ptr to reg_save_area.
9191  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9192                    FIN, DAG.getIntPtrConstant(8));
9193  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
9194                                    getPointerTy());
9195  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
9196                       MachinePointerInfo(SV, 16), false, false, 0);
9197  MemOps.push_back(Store);
9198  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
9199                     &MemOps[0], MemOps.size());
9200}
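
// Illustrative summary of the x86-64 __va_list_tag layout written above
// (offsets taken from the stores in LowerVASTART, not from the original
// comments):
//   +0   i32  gp_offset          current offset into the GPR save area
//   +4   i32  fp_offset          current offset into the XMM save area
//   +8   i8*  overflow_arg_area  next stack-passed (memory) argument
//   +16  i8*  reg_save_area      base of the register save area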
9201
9202SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
9203  assert(Subtarget->is64Bit() &&
9204         "LowerVAARG only handles 64-bit va_arg!");
9205  assert((Subtarget->isTargetLinux() ||
9206          Subtarget->isTargetDarwin()) &&
9207          "Unhandled target in LowerVAARG");
9208  assert(Op.getNode()->getNumOperands() == 4);
9209  SDValue Chain = Op.getOperand(0);
9210  SDValue SrcPtr = Op.getOperand(1);
9211  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9212  unsigned Align = Op.getConstantOperandVal(3);
9213  DebugLoc dl = Op.getDebugLoc();
9214
9215  EVT ArgVT = Op.getNode()->getValueType(0);
9216  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9217  uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
9218  uint8_t ArgMode;
9219
9220  // Decide which area this value should be read from.
9221  // TODO: Implement the AMD64 ABI in its entirety. This simple
9222  // selection mechanism works only for the basic types.
9223  if (ArgVT == MVT::f80) {
9224    llvm_unreachable("va_arg for f80 not yet implemented");
9225  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
9226    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
9227  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
9228    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
9229  } else {
9230    llvm_unreachable("Unhandled argument type in LowerVAARG");
9231  }
9232
9233  if (ArgMode == 2) {
9234    // Sanity Check: Make sure using fp_offset makes sense.
9235    assert(!getTargetMachine().Options.UseSoftFloat &&
9236           !(DAG.getMachineFunction()
9237                .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
9238           Subtarget->hasSSE1());
9239  }
9240
9241  // Insert VAARG_64 node into the DAG
9242  // VAARG_64 returns two values: Variable Argument Address, Chain
9243  SmallVector<SDValue, 11> InstOps;
9244  InstOps.push_back(Chain);
9245  InstOps.push_back(SrcPtr);
9246  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
9247  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
9248  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
9249  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
9250  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
9251                                          VTs, &InstOps[0], InstOps.size(),
9252                                          MVT::i64,
9253                                          MachinePointerInfo(SV),
9254                                          /*Align=*/0,
9255                                          /*Volatile=*/false,
9256                                          /*ReadMem=*/true,
9257                                          /*WriteMem=*/true);
9258  Chain = VAARG.getValue(1);
9259
9260  // Load the next argument and return it
9261  return DAG.getLoad(ArgVT, dl,
9262                     Chain,
9263                     VAARG,
9264                     MachinePointerInfo(),
9265                     false, false, false, 0);
9266}
9267
9268SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
9269  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
9270  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
9271  SDValue Chain = Op.getOperand(0);
9272  SDValue DstPtr = Op.getOperand(1);
9273  SDValue SrcPtr = Op.getOperand(2);
9274  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
9275  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
9276  DebugLoc DL = Op.getDebugLoc();
9277
9278  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
9279                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
9280                       false,
9281                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
9282}
9283
9284// getTargetVShiftNode - Handle vector element shifts where the shift amount
9285// may or may not be a constant. Takes immediate version of shift as input.
9286static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
9287                                   SDValue SrcOp, SDValue ShAmt,
9288                                   SelectionDAG &DAG) {
9289  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
9290
9291  if (isa<ConstantSDNode>(ShAmt)) {
9292    switch (Opc) {
9293      default: llvm_unreachable("Unknown target vector shift node");
9294      case X86ISD::VSHLI:
9295      case X86ISD::VSRLI:
9296      case X86ISD::VSRAI:
9297        return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
9298    }
9299  }
9300
9301  // Change opcode to non-immediate version
9302  switch (Opc) {
9303    default: llvm_unreachable("Unknown target vector shift node");
9304    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
9305    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
9306    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
9307  }
9308
9309  // Need to build a vector containing shift amount
9310  // The shift amount is 32 bits, but the SSE instructions read 64 bits, so fill with 0
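      // For example, for VT = v4i32 and ShAmt = 3 this builds <3, 0, undef, undef>
      // as a v4i32 and bitcasts it to VT before issuing the non-immediate shift.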
9311  SDValue ShOps[4];
9312  ShOps[0] = ShAmt;
9313  ShOps[1] = DAG.getConstant(0, MVT::i32);
9314  ShOps[2] = DAG.getUNDEF(MVT::i32);
9315  ShOps[3] = DAG.getUNDEF(MVT::i32);
9316  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
9317  ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
9318  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
9319}
9320
9321SDValue
9322X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
9323  DebugLoc dl = Op.getDebugLoc();
9324  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9325  switch (IntNo) {
9326  default: return SDValue();    // Don't custom lower most intrinsics.
9327  // Comparison intrinsics.
9328  case Intrinsic::x86_sse_comieq_ss:
9329  case Intrinsic::x86_sse_comilt_ss:
9330  case Intrinsic::x86_sse_comile_ss:
9331  case Intrinsic::x86_sse_comigt_ss:
9332  case Intrinsic::x86_sse_comige_ss:
9333  case Intrinsic::x86_sse_comineq_ss:
9334  case Intrinsic::x86_sse_ucomieq_ss:
9335  case Intrinsic::x86_sse_ucomilt_ss:
9336  case Intrinsic::x86_sse_ucomile_ss:
9337  case Intrinsic::x86_sse_ucomigt_ss:
9338  case Intrinsic::x86_sse_ucomige_ss:
9339  case Intrinsic::x86_sse_ucomineq_ss:
9340  case Intrinsic::x86_sse2_comieq_sd:
9341  case Intrinsic::x86_sse2_comilt_sd:
9342  case Intrinsic::x86_sse2_comile_sd:
9343  case Intrinsic::x86_sse2_comigt_sd:
9344  case Intrinsic::x86_sse2_comige_sd:
9345  case Intrinsic::x86_sse2_comineq_sd:
9346  case Intrinsic::x86_sse2_ucomieq_sd:
9347  case Intrinsic::x86_sse2_ucomilt_sd:
9348  case Intrinsic::x86_sse2_ucomile_sd:
9349  case Intrinsic::x86_sse2_ucomigt_sd:
9350  case Intrinsic::x86_sse2_ucomige_sd:
9351  case Intrinsic::x86_sse2_ucomineq_sd: {
9352    unsigned Opc = 0;
9353    ISD::CondCode CC = ISD::SETCC_INVALID;
9354    switch (IntNo) {
9355    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
9356    case Intrinsic::x86_sse_comieq_ss:
9357    case Intrinsic::x86_sse2_comieq_sd:
9358      Opc = X86ISD::COMI;
9359      CC = ISD::SETEQ;
9360      break;
9361    case Intrinsic::x86_sse_comilt_ss:
9362    case Intrinsic::x86_sse2_comilt_sd:
9363      Opc = X86ISD::COMI;
9364      CC = ISD::SETLT;
9365      break;
9366    case Intrinsic::x86_sse_comile_ss:
9367    case Intrinsic::x86_sse2_comile_sd:
9368      Opc = X86ISD::COMI;
9369      CC = ISD::SETLE;
9370      break;
9371    case Intrinsic::x86_sse_comigt_ss:
9372    case Intrinsic::x86_sse2_comigt_sd:
9373      Opc = X86ISD::COMI;
9374      CC = ISD::SETGT;
9375      break;
9376    case Intrinsic::x86_sse_comige_ss:
9377    case Intrinsic::x86_sse2_comige_sd:
9378      Opc = X86ISD::COMI;
9379      CC = ISD::SETGE;
9380      break;
9381    case Intrinsic::x86_sse_comineq_ss:
9382    case Intrinsic::x86_sse2_comineq_sd:
9383      Opc = X86ISD::COMI;
9384      CC = ISD::SETNE;
9385      break;
9386    case Intrinsic::x86_sse_ucomieq_ss:
9387    case Intrinsic::x86_sse2_ucomieq_sd:
9388      Opc = X86ISD::UCOMI;
9389      CC = ISD::SETEQ;
9390      break;
9391    case Intrinsic::x86_sse_ucomilt_ss:
9392    case Intrinsic::x86_sse2_ucomilt_sd:
9393      Opc = X86ISD::UCOMI;
9394      CC = ISD::SETLT;
9395      break;
9396    case Intrinsic::x86_sse_ucomile_ss:
9397    case Intrinsic::x86_sse2_ucomile_sd:
9398      Opc = X86ISD::UCOMI;
9399      CC = ISD::SETLE;
9400      break;
9401    case Intrinsic::x86_sse_ucomigt_ss:
9402    case Intrinsic::x86_sse2_ucomigt_sd:
9403      Opc = X86ISD::UCOMI;
9404      CC = ISD::SETGT;
9405      break;
9406    case Intrinsic::x86_sse_ucomige_ss:
9407    case Intrinsic::x86_sse2_ucomige_sd:
9408      Opc = X86ISD::UCOMI;
9409      CC = ISD::SETGE;
9410      break;
9411    case Intrinsic::x86_sse_ucomineq_ss:
9412    case Intrinsic::x86_sse2_ucomineq_sd:
9413      Opc = X86ISD::UCOMI;
9414      CC = ISD::SETNE;
9415      break;
9416    }
9417
9418    SDValue LHS = Op.getOperand(1);
9419    SDValue RHS = Op.getOperand(2);
9420    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
9421    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
9422    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
9423    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9424                                DAG.getConstant(X86CC, MVT::i8), Cond);
9425    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
9426  }
9427  // XOP comparison intrinsics
9428  case Intrinsic::x86_xop_vpcomltb:
9429  case Intrinsic::x86_xop_vpcomltw:
9430  case Intrinsic::x86_xop_vpcomltd:
9431  case Intrinsic::x86_xop_vpcomltq:
9432  case Intrinsic::x86_xop_vpcomltub:
9433  case Intrinsic::x86_xop_vpcomltuw:
9434  case Intrinsic::x86_xop_vpcomltud:
9435  case Intrinsic::x86_xop_vpcomltuq:
9436  case Intrinsic::x86_xop_vpcomleb:
9437  case Intrinsic::x86_xop_vpcomlew:
9438  case Intrinsic::x86_xop_vpcomled:
9439  case Intrinsic::x86_xop_vpcomleq:
9440  case Intrinsic::x86_xop_vpcomleub:
9441  case Intrinsic::x86_xop_vpcomleuw:
9442  case Intrinsic::x86_xop_vpcomleud:
9443  case Intrinsic::x86_xop_vpcomleuq:
9444  case Intrinsic::x86_xop_vpcomgtb:
9445  case Intrinsic::x86_xop_vpcomgtw:
9446  case Intrinsic::x86_xop_vpcomgtd:
9447  case Intrinsic::x86_xop_vpcomgtq:
9448  case Intrinsic::x86_xop_vpcomgtub:
9449  case Intrinsic::x86_xop_vpcomgtuw:
9450  case Intrinsic::x86_xop_vpcomgtud:
9451  case Intrinsic::x86_xop_vpcomgtuq:
9452  case Intrinsic::x86_xop_vpcomgeb:
9453  case Intrinsic::x86_xop_vpcomgew:
9454  case Intrinsic::x86_xop_vpcomged:
9455  case Intrinsic::x86_xop_vpcomgeq:
9456  case Intrinsic::x86_xop_vpcomgeub:
9457  case Intrinsic::x86_xop_vpcomgeuw:
9458  case Intrinsic::x86_xop_vpcomgeud:
9459  case Intrinsic::x86_xop_vpcomgeuq:
9460  case Intrinsic::x86_xop_vpcomeqb:
9461  case Intrinsic::x86_xop_vpcomeqw:
9462  case Intrinsic::x86_xop_vpcomeqd:
9463  case Intrinsic::x86_xop_vpcomeqq:
9464  case Intrinsic::x86_xop_vpcomequb:
9465  case Intrinsic::x86_xop_vpcomequw:
9466  case Intrinsic::x86_xop_vpcomequd:
9467  case Intrinsic::x86_xop_vpcomequq:
9468  case Intrinsic::x86_xop_vpcomneb:
9469  case Intrinsic::x86_xop_vpcomnew:
9470  case Intrinsic::x86_xop_vpcomned:
9471  case Intrinsic::x86_xop_vpcomneq:
9472  case Intrinsic::x86_xop_vpcomneub:
9473  case Intrinsic::x86_xop_vpcomneuw:
9474  case Intrinsic::x86_xop_vpcomneud:
9475  case Intrinsic::x86_xop_vpcomneuq:
9476  case Intrinsic::x86_xop_vpcomfalseb:
9477  case Intrinsic::x86_xop_vpcomfalsew:
9478  case Intrinsic::x86_xop_vpcomfalsed:
9479  case Intrinsic::x86_xop_vpcomfalseq:
9480  case Intrinsic::x86_xop_vpcomfalseub:
9481  case Intrinsic::x86_xop_vpcomfalseuw:
9482  case Intrinsic::x86_xop_vpcomfalseud:
9483  case Intrinsic::x86_xop_vpcomfalseuq:
9484  case Intrinsic::x86_xop_vpcomtrueb:
9485  case Intrinsic::x86_xop_vpcomtruew:
9486  case Intrinsic::x86_xop_vpcomtrued:
9487  case Intrinsic::x86_xop_vpcomtrueq:
9488  case Intrinsic::x86_xop_vpcomtrueub:
9489  case Intrinsic::x86_xop_vpcomtrueuw:
9490  case Intrinsic::x86_xop_vpcomtrueud:
9491  case Intrinsic::x86_xop_vpcomtrueuq: {
9492    unsigned CC = 0;
9493    unsigned Opc = 0;
9494
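        // Map each intrinsic onto the VPCOM/VPCOMU condition-code immediate:
        // 0 = LT, 1 = LE, 2 = GT, 3 = GE, 4 = EQ, 5 = NE, 6 = FALSE, 7 = TRUE.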
9495    switch (IntNo) {
9496    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
9497    case Intrinsic::x86_xop_vpcomltb:
9498    case Intrinsic::x86_xop_vpcomltw:
9499    case Intrinsic::x86_xop_vpcomltd:
9500    case Intrinsic::x86_xop_vpcomltq:
9501      CC = 0;
9502      Opc = X86ISD::VPCOM;
9503      break;
9504    case Intrinsic::x86_xop_vpcomltub:
9505    case Intrinsic::x86_xop_vpcomltuw:
9506    case Intrinsic::x86_xop_vpcomltud:
9507    case Intrinsic::x86_xop_vpcomltuq:
9508      CC = 0;
9509      Opc = X86ISD::VPCOMU;
9510      break;
9511    case Intrinsic::x86_xop_vpcomleb:
9512    case Intrinsic::x86_xop_vpcomlew:
9513    case Intrinsic::x86_xop_vpcomled:
9514    case Intrinsic::x86_xop_vpcomleq:
9515      CC = 1;
9516      Opc = X86ISD::VPCOM;
9517      break;
9518    case Intrinsic::x86_xop_vpcomleub:
9519    case Intrinsic::x86_xop_vpcomleuw:
9520    case Intrinsic::x86_xop_vpcomleud:
9521    case Intrinsic::x86_xop_vpcomleuq:
9522      CC = 1;
9523      Opc = X86ISD::VPCOMU;
9524      break;
9525    case Intrinsic::x86_xop_vpcomgtb:
9526    case Intrinsic::x86_xop_vpcomgtw:
9527    case Intrinsic::x86_xop_vpcomgtd:
9528    case Intrinsic::x86_xop_vpcomgtq:
9529      CC = 2;
9530      Opc = X86ISD::VPCOM;
9531      break;
9532    case Intrinsic::x86_xop_vpcomgtub:
9533    case Intrinsic::x86_xop_vpcomgtuw:
9534    case Intrinsic::x86_xop_vpcomgtud:
9535    case Intrinsic::x86_xop_vpcomgtuq:
9536      CC = 2;
9537      Opc = X86ISD::VPCOMU;
9538      break;
9539    case Intrinsic::x86_xop_vpcomgeb:
9540    case Intrinsic::x86_xop_vpcomgew:
9541    case Intrinsic::x86_xop_vpcomged:
9542    case Intrinsic::x86_xop_vpcomgeq:
9543      CC = 3;
9544      Opc = X86ISD::VPCOM;
9545      break;
9546    case Intrinsic::x86_xop_vpcomgeub:
9547    case Intrinsic::x86_xop_vpcomgeuw:
9548    case Intrinsic::x86_xop_vpcomgeud:
9549    case Intrinsic::x86_xop_vpcomgeuq:
9550      CC = 3;
9551      Opc = X86ISD::VPCOMU;
9552      break;
9553    case Intrinsic::x86_xop_vpcomeqb:
9554    case Intrinsic::x86_xop_vpcomeqw:
9555    case Intrinsic::x86_xop_vpcomeqd:
9556    case Intrinsic::x86_xop_vpcomeqq:
9557      CC = 4;
9558      Opc = X86ISD::VPCOM;
9559      break;
9560    case Intrinsic::x86_xop_vpcomequb:
9561    case Intrinsic::x86_xop_vpcomequw:
9562    case Intrinsic::x86_xop_vpcomequd:
9563    case Intrinsic::x86_xop_vpcomequq:
9564      CC = 4;
9565      Opc = X86ISD::VPCOMU;
9566      break;
9567    case Intrinsic::x86_xop_vpcomneb:
9568    case Intrinsic::x86_xop_vpcomnew:
9569    case Intrinsic::x86_xop_vpcomned:
9570    case Intrinsic::x86_xop_vpcomneq:
9571      CC = 5;
9572      Opc = X86ISD::VPCOM;
9573      break;
9574    case Intrinsic::x86_xop_vpcomneub:
9575    case Intrinsic::x86_xop_vpcomneuw:
9576    case Intrinsic::x86_xop_vpcomneud:
9577    case Intrinsic::x86_xop_vpcomneuq:
9578      CC = 5;
9579      Opc = X86ISD::VPCOMU;
9580      break;
9581    case Intrinsic::x86_xop_vpcomfalseb:
9582    case Intrinsic::x86_xop_vpcomfalsew:
9583    case Intrinsic::x86_xop_vpcomfalsed:
9584    case Intrinsic::x86_xop_vpcomfalseq:
9585      CC = 6;
9586      Opc = X86ISD::VPCOM;
9587      break;
9588    case Intrinsic::x86_xop_vpcomfalseub:
9589    case Intrinsic::x86_xop_vpcomfalseuw:
9590    case Intrinsic::x86_xop_vpcomfalseud:
9591    case Intrinsic::x86_xop_vpcomfalseuq:
9592      CC = 6;
9593      Opc = X86ISD::VPCOMU;
9594      break;
9595    case Intrinsic::x86_xop_vpcomtrueb:
9596    case Intrinsic::x86_xop_vpcomtruew:
9597    case Intrinsic::x86_xop_vpcomtrued:
9598    case Intrinsic::x86_xop_vpcomtrueq:
9599      CC = 7;
9600      Opc = X86ISD::VPCOM;
9601      break;
9602    case Intrinsic::x86_xop_vpcomtrueub:
9603    case Intrinsic::x86_xop_vpcomtrueuw:
9604    case Intrinsic::x86_xop_vpcomtrueud:
9605    case Intrinsic::x86_xop_vpcomtrueuq:
9606      CC = 7;
9607      Opc = X86ISD::VPCOMU;
9608      break;
9609    }
9610
9611    SDValue LHS = Op.getOperand(1);
9612    SDValue RHS = Op.getOperand(2);
9613    return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS,
9614                       DAG.getConstant(CC, MVT::i8));
9615  }
9616
9617  // Arithmetic intrinsics.
9618  case Intrinsic::x86_sse2_pmulu_dq:
9619  case Intrinsic::x86_avx2_pmulu_dq:
9620    return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
9621                       Op.getOperand(1), Op.getOperand(2));
9622  case Intrinsic::x86_sse3_hadd_ps:
9623  case Intrinsic::x86_sse3_hadd_pd:
9624  case Intrinsic::x86_avx_hadd_ps_256:
9625  case Intrinsic::x86_avx_hadd_pd_256:
9626    return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(),
9627                       Op.getOperand(1), Op.getOperand(2));
9628  case Intrinsic::x86_sse3_hsub_ps:
9629  case Intrinsic::x86_sse3_hsub_pd:
9630  case Intrinsic::x86_avx_hsub_ps_256:
9631  case Intrinsic::x86_avx_hsub_pd_256:
9632    return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
9633                       Op.getOperand(1), Op.getOperand(2));
9634  case Intrinsic::x86_ssse3_phadd_w_128:
9635  case Intrinsic::x86_ssse3_phadd_d_128:
9636  case Intrinsic::x86_avx2_phadd_w:
9637  case Intrinsic::x86_avx2_phadd_d:
9638    return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(),
9639                       Op.getOperand(1), Op.getOperand(2));
9640  case Intrinsic::x86_ssse3_phsub_w_128:
9641  case Intrinsic::x86_ssse3_phsub_d_128:
9642  case Intrinsic::x86_avx2_phsub_w:
9643  case Intrinsic::x86_avx2_phsub_d:
9644    return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(),
9645                       Op.getOperand(1), Op.getOperand(2));
9646  case Intrinsic::x86_avx2_psllv_d:
9647  case Intrinsic::x86_avx2_psllv_q:
9648  case Intrinsic::x86_avx2_psllv_d_256:
9649  case Intrinsic::x86_avx2_psllv_q_256:
9650    return DAG.getNode(ISD::SHL, dl, Op.getValueType(),
9651                      Op.getOperand(1), Op.getOperand(2));
9652  case Intrinsic::x86_avx2_psrlv_d:
9653  case Intrinsic::x86_avx2_psrlv_q:
9654  case Intrinsic::x86_avx2_psrlv_d_256:
9655  case Intrinsic::x86_avx2_psrlv_q_256:
9656    return DAG.getNode(ISD::SRL, dl, Op.getValueType(),
9657                      Op.getOperand(1), Op.getOperand(2));
9658  case Intrinsic::x86_avx2_psrav_d:
9659  case Intrinsic::x86_avx2_psrav_d_256:
9660    return DAG.getNode(ISD::SRA, dl, Op.getValueType(),
9661                      Op.getOperand(1), Op.getOperand(2));
9662  case Intrinsic::x86_ssse3_pshuf_b_128:
9663  case Intrinsic::x86_avx2_pshuf_b:
9664    return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
9665                       Op.getOperand(1), Op.getOperand(2));
9666  case Intrinsic::x86_ssse3_psign_b_128:
9667  case Intrinsic::x86_ssse3_psign_w_128:
9668  case Intrinsic::x86_ssse3_psign_d_128:
9669  case Intrinsic::x86_avx2_psign_b:
9670  case Intrinsic::x86_avx2_psign_w:
9671  case Intrinsic::x86_avx2_psign_d:
9672    return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
9673                       Op.getOperand(1), Op.getOperand(2));
9674  case Intrinsic::x86_sse41_insertps:
9675    return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
9676                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
9677  case Intrinsic::x86_avx_vperm2f128_ps_256:
9678  case Intrinsic::x86_avx_vperm2f128_pd_256:
9679  case Intrinsic::x86_avx_vperm2f128_si_256:
9680  case Intrinsic::x86_avx2_vperm2i128:
9681    return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
9682                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
9683  case Intrinsic::x86_avx2_permd:
9684  case Intrinsic::x86_avx2_permps:
9685    // Operands intentionally swapped. Mask is last operand to intrinsic,
9686    // but second operand for node/instruction.
9687    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
9688                       Op.getOperand(2), Op.getOperand(1));
9689
9690  // ptest and testp intrinsics. The intrinsics these come from are designed to
9691  // return an integer value, not just an instruction, so lower them to the ptest
9692  // or testp pattern and a setcc for the result.
9693  case Intrinsic::x86_sse41_ptestz:
9694  case Intrinsic::x86_sse41_ptestc:
9695  case Intrinsic::x86_sse41_ptestnzc:
9696  case Intrinsic::x86_avx_ptestz_256:
9697  case Intrinsic::x86_avx_ptestc_256:
9698  case Intrinsic::x86_avx_ptestnzc_256:
9699  case Intrinsic::x86_avx_vtestz_ps:
9700  case Intrinsic::x86_avx_vtestc_ps:
9701  case Intrinsic::x86_avx_vtestnzc_ps:
9702  case Intrinsic::x86_avx_vtestz_pd:
9703  case Intrinsic::x86_avx_vtestc_pd:
9704  case Intrinsic::x86_avx_vtestnzc_pd:
9705  case Intrinsic::x86_avx_vtestz_ps_256:
9706  case Intrinsic::x86_avx_vtestc_ps_256:
9707  case Intrinsic::x86_avx_vtestnzc_ps_256:
9708  case Intrinsic::x86_avx_vtestz_pd_256:
9709  case Intrinsic::x86_avx_vtestc_pd_256:
9710  case Intrinsic::x86_avx_vtestnzc_pd_256: {
9711    bool IsTestPacked = false;
9712    unsigned X86CC = 0;
9713    switch (IntNo) {
9714    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
9715    case Intrinsic::x86_avx_vtestz_ps:
9716    case Intrinsic::x86_avx_vtestz_pd:
9717    case Intrinsic::x86_avx_vtestz_ps_256:
9718    case Intrinsic::x86_avx_vtestz_pd_256:
9719      IsTestPacked = true; // Fallthrough
9720    case Intrinsic::x86_sse41_ptestz:
9721    case Intrinsic::x86_avx_ptestz_256:
9722      // ZF = 1
9723      X86CC = X86::COND_E;
9724      break;
9725    case Intrinsic::x86_avx_vtestc_ps:
9726    case Intrinsic::x86_avx_vtestc_pd:
9727    case Intrinsic::x86_avx_vtestc_ps_256:
9728    case Intrinsic::x86_avx_vtestc_pd_256:
9729      IsTestPacked = true; // Fallthrough
9730    case Intrinsic::x86_sse41_ptestc:
9731    case Intrinsic::x86_avx_ptestc_256:
9732      // CF = 1
9733      X86CC = X86::COND_B;
9734      break;
9735    case Intrinsic::x86_avx_vtestnzc_ps:
9736    case Intrinsic::x86_avx_vtestnzc_pd:
9737    case Intrinsic::x86_avx_vtestnzc_ps_256:
9738    case Intrinsic::x86_avx_vtestnzc_pd_256:
9739      IsTestPacked = true; // Fallthrough
9740    case Intrinsic::x86_sse41_ptestnzc:
9741    case Intrinsic::x86_avx_ptestnzc_256:
9742      // ZF and CF = 0
9743      X86CC = X86::COND_A;
9744      break;
9745    }
9746
9747    SDValue LHS = Op.getOperand(1);
9748    SDValue RHS = Op.getOperand(2);
9749    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
9750    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
9751    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
9752    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
9753    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
9754  }
9755
9756  // SSE/AVX shift intrinsics
9757  case Intrinsic::x86_sse2_psll_w:
9758  case Intrinsic::x86_sse2_psll_d:
9759  case Intrinsic::x86_sse2_psll_q:
9760  case Intrinsic::x86_avx2_psll_w:
9761  case Intrinsic::x86_avx2_psll_d:
9762  case Intrinsic::x86_avx2_psll_q:
9763    return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(),
9764                       Op.getOperand(1), Op.getOperand(2));
9765  case Intrinsic::x86_sse2_psrl_w:
9766  case Intrinsic::x86_sse2_psrl_d:
9767  case Intrinsic::x86_sse2_psrl_q:
9768  case Intrinsic::x86_avx2_psrl_w:
9769  case Intrinsic::x86_avx2_psrl_d:
9770  case Intrinsic::x86_avx2_psrl_q:
9771    return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(),
9772                       Op.getOperand(1), Op.getOperand(2));
9773  case Intrinsic::x86_sse2_psra_w:
9774  case Intrinsic::x86_sse2_psra_d:
9775  case Intrinsic::x86_avx2_psra_w:
9776  case Intrinsic::x86_avx2_psra_d:
9777    return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(),
9778                       Op.getOperand(1), Op.getOperand(2));
9779  case Intrinsic::x86_sse2_pslli_w:
9780  case Intrinsic::x86_sse2_pslli_d:
9781  case Intrinsic::x86_sse2_pslli_q:
9782  case Intrinsic::x86_avx2_pslli_w:
9783  case Intrinsic::x86_avx2_pslli_d:
9784  case Intrinsic::x86_avx2_pslli_q:
9785    return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(),
9786                               Op.getOperand(1), Op.getOperand(2), DAG);
9787  case Intrinsic::x86_sse2_psrli_w:
9788  case Intrinsic::x86_sse2_psrli_d:
9789  case Intrinsic::x86_sse2_psrli_q:
9790  case Intrinsic::x86_avx2_psrli_w:
9791  case Intrinsic::x86_avx2_psrli_d:
9792  case Intrinsic::x86_avx2_psrli_q:
9793    return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(),
9794                               Op.getOperand(1), Op.getOperand(2), DAG);
9795  case Intrinsic::x86_sse2_psrai_w:
9796  case Intrinsic::x86_sse2_psrai_d:
9797  case Intrinsic::x86_avx2_psrai_w:
9798  case Intrinsic::x86_avx2_psrai_d:
9799    return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(),
9800                               Op.getOperand(1), Op.getOperand(2), DAG);
9801  // Fix vector shift instructions where the last operand is a non-immediate
9802  // i32 value.
9803  case Intrinsic::x86_mmx_pslli_w:
9804  case Intrinsic::x86_mmx_pslli_d:
9805  case Intrinsic::x86_mmx_pslli_q:
9806  case Intrinsic::x86_mmx_psrli_w:
9807  case Intrinsic::x86_mmx_psrli_d:
9808  case Intrinsic::x86_mmx_psrli_q:
9809  case Intrinsic::x86_mmx_psrai_w:
9810  case Intrinsic::x86_mmx_psrai_d: {
9811    SDValue ShAmt = Op.getOperand(2);
9812    if (isa<ConstantSDNode>(ShAmt))
9813      return SDValue();
9814
9815    unsigned NewIntNo = 0;
9816    switch (IntNo) {
9817    case Intrinsic::x86_mmx_pslli_w:
9818      NewIntNo = Intrinsic::x86_mmx_psll_w;
9819      break;
9820    case Intrinsic::x86_mmx_pslli_d:
9821      NewIntNo = Intrinsic::x86_mmx_psll_d;
9822      break;
9823    case Intrinsic::x86_mmx_pslli_q:
9824      NewIntNo = Intrinsic::x86_mmx_psll_q;
9825      break;
9826    case Intrinsic::x86_mmx_psrli_w:
9827      NewIntNo = Intrinsic::x86_mmx_psrl_w;
9828      break;
9829    case Intrinsic::x86_mmx_psrli_d:
9830      NewIntNo = Intrinsic::x86_mmx_psrl_d;
9831      break;
9832    case Intrinsic::x86_mmx_psrli_q:
9833      NewIntNo = Intrinsic::x86_mmx_psrl_q;
9834      break;
9835    case Intrinsic::x86_mmx_psrai_w:
9836      NewIntNo = Intrinsic::x86_mmx_psra_w;
9837      break;
9838    case Intrinsic::x86_mmx_psrai_d:
9839      NewIntNo = Intrinsic::x86_mmx_psra_d;
9840      break;
9841    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
9842    }
9843
9844    // The vector shift intrinsics with scalars use 32-bit shift amounts, but
9845    // the SSE2/MMX shift instructions read 64 bits. Set the upper 32 bits
9846    // to zero.
9847    ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt,
9848                         DAG.getConstant(0, MVT::i32));
9849// FIXME this must be lowered to get rid of the invalid type.
9850
9851    EVT VT = Op.getValueType();
9852    ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
9853    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9854                       DAG.getConstant(NewIntNo, MVT::i32),
9855                       Op.getOperand(1), ShAmt);
9856  }
9857  }
9858}
9859
9860SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
9861                                           SelectionDAG &DAG) const {
9862  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
9863  MFI->setReturnAddressIsTaken(true);
9864
9865  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9866  DebugLoc dl = Op.getDebugLoc();
9867
9868  if (Depth > 0) {
9869    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
9870    SDValue Offset =
9871      DAG.getConstant(TD->getPointerSize(),
9872                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
9873    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
9874                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
9875                                   FrameAddr, Offset),
9876                       MachinePointerInfo(), false, false, false, 0);
9877  }
9878
9879  // Just load the return address.
9880  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
9881  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
9882                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
9883}
9884
9885SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
9886  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
9887  MFI->setFrameAddressIsTaken(true);
9888
9889  EVT VT = Op.getValueType();
9890  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
9891  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9892  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
9893  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
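      // Each frame's saved frame pointer lives at [FrameAddr], so walking the
      // chain one load at a time recovers the frame address at the given depth.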
9894  while (Depth--)
9895    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
9896                            MachinePointerInfo(),
9897                            false, false, false, 0);
9898  return FrameAddr;
9899}
9900
9901SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
9902                                                     SelectionDAG &DAG) const {
9903  return DAG.getIntPtrConstant(2*TD->getPointerSize());
9904}
9905
9906SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
9907  MachineFunction &MF = DAG.getMachineFunction();
9908  SDValue Chain     = Op.getOperand(0);
9909  SDValue Offset    = Op.getOperand(1);
9910  SDValue Handler   = Op.getOperand(2);
9911  DebugLoc dl       = Op.getDebugLoc();
9912
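      // Store the handler address into the (offset-adjusted) return-address slot
      // at [FrameReg + PtrSize + Offset], then pass that slot's address to the
      // EH_RETURN node in RCX/ECX.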
9913  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
9914                                     Subtarget->is64Bit() ? X86::RBP : X86::EBP,
9915                                     getPointerTy());
9916  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
9917
9918  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
9919                                  DAG.getIntPtrConstant(TD->getPointerSize()));
9920  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
9921  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
9922                       false, false, 0);
9923  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
9924  MF.getRegInfo().addLiveOut(StoreAddrReg);
9925
9926  return DAG.getNode(X86ISD::EH_RETURN, dl,
9927                     MVT::Other,
9928                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
9929}
9930
9931SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
9932                                                  SelectionDAG &DAG) const {
9933  return Op.getOperand(0);
9934}
9935
9936SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
9937                                                SelectionDAG &DAG) const {
9938  SDValue Root = Op.getOperand(0);
9939  SDValue Trmp = Op.getOperand(1); // trampoline
9940  SDValue FPtr = Op.getOperand(2); // nested function
9941  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
9942  DebugLoc dl  = Op.getDebugLoc();
9943
9944  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
9945
9946  if (Subtarget->is64Bit()) {
9947    SDValue OutChains[6];
9948
9949    // Large code-model.
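        // The generated trampoline is, roughly:
        //   movabsq $<nested function>, %r11
        //   movabsq $<nest value>,      %r10
        //   jmpq    *%r11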
9950    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
9951    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
9952
9953    const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
9954    const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
9955
9956    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
9957
9958    // Load the pointer to the nested function into R11.
9959    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
9960    SDValue Addr = Trmp;
9961    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
9962                                Addr, MachinePointerInfo(TrmpAddr),
9963                                false, false, 0);
9964
9965    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
9966                       DAG.getConstant(2, MVT::i64));
9967    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
9968                                MachinePointerInfo(TrmpAddr, 2),
9969                                false, false, 2);
9970
9971    // Load the 'nest' parameter value into R10.
9972    // R10 is specified in X86CallingConv.td
9973    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
9974    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
9975                       DAG.getConstant(10, MVT::i64));
9976    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
9977                                Addr, MachinePointerInfo(TrmpAddr, 10),
9978                                false, false, 0);
9979
9980    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
9981                       DAG.getConstant(12, MVT::i64));
9982    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
9983                                MachinePointerInfo(TrmpAddr, 12),
9984                                false, false, 2);
9985
9986    // Jump to the nested function.
9987    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
9988    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
9989                       DAG.getConstant(20, MVT::i64));
9990    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
9991                                Addr, MachinePointerInfo(TrmpAddr, 20),
9992                                false, false, 0);
9993
9994    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
9995    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
9996                       DAG.getConstant(22, MVT::i64));
9997    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
9998                                MachinePointerInfo(TrmpAddr, 22),
9999                                false, false, 0);
10000
10001    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
10002  } else {
10003    const Function *Func =
10004      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
10005    CallingConv::ID CC = Func->getCallingConv();
10006    unsigned NestReg;
10007
10008    switch (CC) {
10009    default:
10010      llvm_unreachable("Unsupported calling convention");
10011    case CallingConv::C:
10012    case CallingConv::X86_StdCall: {
10013      // Pass 'nest' parameter in ECX.
10014      // Must be kept in sync with X86CallingConv.td
10015      NestReg = X86::ECX;
10016
10017      // Check that ECX wasn't needed by an 'inreg' parameter.
10018      FunctionType *FTy = Func->getFunctionType();
10019      const AttrListPtr &Attrs = Func->getAttributes();
10020
10021      if (!Attrs.isEmpty() && !Func->isVarArg()) {
10022        unsigned InRegCount = 0;
10023        unsigned Idx = 1;
10024
10025        for (FunctionType::param_iterator I = FTy->param_begin(),
10026             E = FTy->param_end(); I != E; ++I, ++Idx)
10027          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
10028            // FIXME: should only count parameters that are lowered to integers.
10029            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
10030
10031        if (InRegCount > 2) {
10032          report_fatal_error("Nest register in use - reduce number of inreg"
10033                             " parameters!");
10034        }
10035      }
10036      break;
10037    }
10038    case CallingConv::X86_FastCall:
10039    case CallingConv::X86_ThisCall:
10040    case CallingConv::Fast:
10041      // Pass 'nest' parameter in EAX.
10042      // Must be kept in sync with X86CallingConv.td
10043      NestReg = X86::EAX;
10044      break;
10045    }
10046
10047    SDValue OutChains[4];
10048    SDValue Addr, Disp;
10049
10050    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10051                       DAG.getConstant(10, MVT::i32));
10052    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
10053
10054    // This is storing the opcode for MOV32ri.
10055    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
10056    const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
10057    OutChains[0] = DAG.getStore(Root, dl,
10058                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
10059                                Trmp, MachinePointerInfo(TrmpAddr),
10060                                false, false, 0);
10061
10062    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10063                       DAG.getConstant(1, MVT::i32));
10064    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
10065                                MachinePointerInfo(TrmpAddr, 1),
10066                                false, false, 1);
10067
10068    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
10069    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10070                       DAG.getConstant(5, MVT::i32));
10071    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
10072                                MachinePointerInfo(TrmpAddr, 5),
10073                                false, false, 1);
10074
10075    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10076                       DAG.getConstant(6, MVT::i32));
10077    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
10078                                MachinePointerInfo(TrmpAddr, 6),
10079                                false, false, 1);
10080
10081    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
10082  }
10083}
10084
10085SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
10086                                            SelectionDAG &DAG) const {
10087  /*
10088   The rounding mode is in bits 11:10 of the FP control word (FPCW), and
10089   has the following settings:
10090     00 Round to nearest
10091     01 Round to -inf
10092     10 Round to +inf
10093     11 Round to 0
10094
10095  FLT_ROUNDS, on the other hand, expects the following:
10096    -1 Undefined
10097     0 Round to 0
10098     1 Round to nearest
10099     2 Round to +inf
10100     3 Round to -inf
10101
10102  To perform the conversion, we do:
10103    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
10104  */
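       // For example, RC = 01 (round toward -inf) has bit 11 = 0 and bit 10 = 1,
       // so the expression evaluates to ((0 | 2) + 1) & 3 = 3, matching the
       // FLT_ROUNDS encoding for round to -inf.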
10105
10106  MachineFunction &MF = DAG.getMachineFunction();
10107  const TargetMachine &TM = MF.getTarget();
10108  const TargetFrameLowering &TFI = *TM.getFrameLowering();
10109  unsigned StackAlignment = TFI.getStackAlignment();
10110  EVT VT = Op.getValueType();
10111  DebugLoc DL = Op.getDebugLoc();
10112
10113  // Save FP Control Word to stack slot
10114  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
10115  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
10116
10117
10118  MachineMemOperand *MMO =
10119   MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
10120                           MachineMemOperand::MOStore, 2, 2);
10121
10122  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
10123  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
10124                                          DAG.getVTList(MVT::Other),
10125                                          Ops, 2, MVT::i16, MMO);
10126
10127  // Load FP Control Word from stack slot
10128  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
10129                            MachinePointerInfo(), false, false, false, 0);
10130
10131  // Transform as necessary
10132  SDValue CWD1 =
10133    DAG.getNode(ISD::SRL, DL, MVT::i16,
10134                DAG.getNode(ISD::AND, DL, MVT::i16,
10135                            CWD, DAG.getConstant(0x800, MVT::i16)),
10136                DAG.getConstant(11, MVT::i8));
10137  SDValue CWD2 =
10138    DAG.getNode(ISD::SRL, DL, MVT::i16,
10139                DAG.getNode(ISD::AND, DL, MVT::i16,
10140                            CWD, DAG.getConstant(0x400, MVT::i16)),
10141                DAG.getConstant(9, MVT::i8));
10142
10143  SDValue RetVal =
10144    DAG.getNode(ISD::AND, DL, MVT::i16,
10145                DAG.getNode(ISD::ADD, DL, MVT::i16,
10146                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
10147                            DAG.getConstant(1, MVT::i16)),
10148                DAG.getConstant(3, MVT::i16));
10149
10150
10151  return DAG.getNode((VT.getSizeInBits() < 16 ?
10152                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
10153}
10154
10155SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
10156  EVT VT = Op.getValueType();
10157  EVT OpVT = VT;
10158  unsigned NumBits = VT.getSizeInBits();
10159  DebugLoc dl = Op.getDebugLoc();
10160
10161  Op = Op.getOperand(0);
10162  if (VT == MVT::i8) {
10163    // Zero extend to i32 since there is no i8 bsr.
10164    OpVT = MVT::i32;
10165    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
10166  }
10167
10168  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
10169  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
10170  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
10171
10172  // If src is zero (i.e. bsr sets ZF), returns NumBits.
10173  SDValue Ops[] = {
10174    Op,
10175    DAG.getConstant(NumBits+NumBits-1, OpVT),
10176    DAG.getConstant(X86::COND_E, MVT::i8),
10177    Op.getValue(1)
10178  };
10179  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
10180
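       // ctlz(x) == (NumBits - 1) - bsr(x); since NumBits is a power of two,
       // NumBits - 1 is an all-ones mask and the subtraction reduces to the
       // xor below.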
10181  // Finally xor with NumBits-1.
10182  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
10183
10184  if (VT == MVT::i8)
10185    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
10186  return Op;
10187}
10188
10189SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op,
10190                                                SelectionDAG &DAG) const {
10191  EVT VT = Op.getValueType();
10192  EVT OpVT = VT;
10193  unsigned NumBits = VT.getSizeInBits();
10194  DebugLoc dl = Op.getDebugLoc();
10195
10196  Op = Op.getOperand(0);
10197  if (VT == MVT::i8) {
10198    // Zero extend to i32 since there is no i8 bsr.
10199    OpVT = MVT::i32;
10200    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
10201  }
10202
10203  // Issue a bsr (scan bits in reverse).
10204  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
10205  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
10206
10207  // And xor with NumBits-1.
10208  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
10209
10210  if (VT == MVT::i8)
10211    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
10212  return Op;
10213}
10214
10215SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
10216  EVT VT = Op.getValueType();
10217  unsigned NumBits = VT.getSizeInBits();
10218  DebugLoc dl = Op.getDebugLoc();
10219  Op = Op.getOperand(0);
10220
10221  // Issue a bsf (scan bits forward) which also sets EFLAGS.
10222  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
10223  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
10224
10225  // If src is zero (i.e. bsf sets ZF), returns NumBits.
10226  SDValue Ops[] = {
10227    Op,
10228    DAG.getConstant(NumBits, VT),
10229    DAG.getConstant(X86::COND_E, MVT::i8),
10230    Op.getValue(1)
10231  };
10232  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
10233}
10234
10235// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
10236// ones, and then concatenate the result back.
10237static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
10238  EVT VT = Op.getValueType();
10239
10240  assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
10241         "Unsupported value type for operation");
10242
10243  unsigned NumElems = VT.getVectorNumElements();
10244  DebugLoc dl = Op.getDebugLoc();
10245
10246  // Extract the LHS vectors
10247  SDValue LHS = Op.getOperand(0);
10248  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
10249  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
10250
10251  // Extract the RHS vectors
10252  SDValue RHS = Op.getOperand(1);
10253  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
10254  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
10255
10256  MVT EltVT = VT.getVectorElementType().getSimpleVT();
10257  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
10258
10259  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
10260                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
10261                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
10262}
10263
10264SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
10265  assert(Op.getValueType().getSizeInBits() == 256 &&
10266         Op.getValueType().isInteger() &&
10267         "Only handle AVX 256-bit vector integer operation");
10268  return Lower256IntArith(Op, DAG);
10269}
10270
10271SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
10272  assert(Op.getValueType().getSizeInBits() == 256 &&
10273         Op.getValueType().isInteger() &&
10274         "Only handle AVX 256-bit vector integer operation");
10275  return Lower256IntArith(Op, DAG);
10276}
10277
10278SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
10279  EVT VT = Op.getValueType();
10280
10281  // Decompose 256-bit ops into smaller 128-bit ops.
10282  if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
10283    return Lower256IntArith(Op, DAG);
10284
10285  assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
10286         "Only know how to lower V2I64/V4I64 multiply");
10287
10288  DebugLoc dl = Op.getDebugLoc();
10289
10290  //  Ahi = psrlqi(a, 32);
10291  //  Bhi = psrlqi(b, 32);
10292  //
10293  //  AloBlo = pmuludq(a, b);
10294  //  AloBhi = pmuludq(a, Bhi);
10295  //  AhiBlo = pmuludq(Ahi, b);
10296
10297  //  AloBhi = psllqi(AloBhi, 32);
10298  //  AhiBlo = psllqi(AhiBlo, 32);
10299  //  return AloBlo + AloBhi + AhiBlo;
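       //
       //  The AhiBhi term is dropped because it only contributes to bits
       //  64..127 of each lane's full product, which a 64-bit multiply discards.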
10300
10301  SDValue A = Op.getOperand(0);
10302  SDValue B = Op.getOperand(1);
10303
10304  SDValue ShAmt = DAG.getConstant(32, MVT::i32);
10305
10306  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
10307  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
10308
10309  // Bit cast to 32-bit vectors for MULUDQ
10310  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
10311  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
10312  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
10313  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
10314  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
10315
10316  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
10317  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
10318  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
10319
10320  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
10321  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
10322
10323  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
10324  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
10325}
10326
10327SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
10328
10329  EVT VT = Op.getValueType();
10330  DebugLoc dl = Op.getDebugLoc();
10331  SDValue R = Op.getOperand(0);
10332  SDValue Amt = Op.getOperand(1);
10333  LLVMContext *Context = DAG.getContext();
10334
10335  if (!Subtarget->hasSSE2())
10336    return SDValue();
10337
10338  // Optimize shl/srl/sra with constant shift amount.
10339  if (isSplatVector(Amt.getNode())) {
10340    SDValue SclrAmt = Amt->getOperand(0);
10341    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
10342      uint64_t ShiftAmt = C->getZExtValue();
10343
10344      if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
10345          (Subtarget->hasAVX2() &&
10346           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
10347        if (Op.getOpcode() == ISD::SHL)
10348          return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
10349                             DAG.getConstant(ShiftAmt, MVT::i32));
10350        if (Op.getOpcode() == ISD::SRL)
10351          return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
10352                             DAG.getConstant(ShiftAmt, MVT::i32));
10353        if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
10354          return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
10355                             DAG.getConstant(ShiftAmt, MVT::i32));
10356      }
10357
10358      if (VT == MVT::v16i8) {
10359        if (Op.getOpcode() == ISD::SHL) {
10360          // Make a large shift.
10361          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
10362                                    DAG.getConstant(ShiftAmt, MVT::i32));
10363          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
10364          // Zero out the rightmost bits.
10365          SmallVector<SDValue, 16> V(16,
10366                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
10367                                                     MVT::i8));
10368          return DAG.getNode(ISD::AND, dl, VT, SHL,
10369                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
10370        }
10371        if (Op.getOpcode() == ISD::SRL) {
10372          // Make a large shift.
10373          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
10374                                    DAG.getConstant(ShiftAmt, MVT::i32));
10375          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
10376          // Zero out the leftmost bits.
10377          SmallVector<SDValue, 16> V(16,
10378                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
10379                                                     MVT::i8));
10380          return DAG.getNode(ISD::AND, dl, VT, SRL,
10381                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
10382        }
10383        if (Op.getOpcode() == ISD::SRA) {
10384          if (ShiftAmt == 7) {
10385            // R s>> 7  ===  R s< 0
10386            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
10387            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
10388          }
10389
10390          // R s>> a === ((R u>> a) ^ m) - m
10391          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
10392          SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
10393                                                         MVT::i8));
10394          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
10395          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
10396          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
10397          return Res;
10398        }
10399        llvm_unreachable("Unknown shift opcode.");
10400      }
10401
10402      if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
10403        if (Op.getOpcode() == ISD::SHL) {
10404          // Make a large shift.
10405          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
10406                                    DAG.getConstant(ShiftAmt, MVT::i32));
10407          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
10408          // Zero out the rightmost bits.
10409          SmallVector<SDValue, 32> V(32,
10410                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
10411                                                     MVT::i8));
10412          return DAG.getNode(ISD::AND, dl, VT, SHL,
10413                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
10414        }
10415        if (Op.getOpcode() == ISD::SRL) {
10416          // Make a large shift.
10417          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
10418                                    DAG.getConstant(ShiftAmt, MVT::i32));
10419          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
10420          // Zero out the leftmost bits.
10421          SmallVector<SDValue, 32> V(32,
10422                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
10423                                                     MVT::i8));
10424          return DAG.getNode(ISD::AND, dl, VT, SRL,
10425                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
10426        }
10427        if (Op.getOpcode() == ISD::SRA) {
10428          if (ShiftAmt == 7) {
10429            // R s>> 7  ===  R s< 0
10430            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
10431            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
10432          }
10433
10434          // R s>> a === ((R u>> a) ^ m) - m
10435          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
10436          SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
10437                                                         MVT::i8));
10438          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
10439          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
10440          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
10441          return Res;
10442        }
10443        llvm_unreachable("Unknown shift opcode.");
10444      }
10445    }
10446  }
10447
10448  // Lower SHL with variable shift amount.
10449  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
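         // Build 2^amt per lane by shifting the amount into the exponent field
         // of an IEEE single (<< 23) and adding the bit pattern of 1.0f
         // (0x3f800000); the int-converted result is then multiplied by R to
         // perform the shift.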
10450    Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1),
10451                     DAG.getConstant(23, MVT::i32));
10452
10453    const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U};
10454    Constant *C = ConstantDataVector::get(*Context, CV);
10455    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
10456    SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
10457                                 MachinePointerInfo::getConstantPool(),
10458                                 false, false, false, 16);
10459
10460    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
10461    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
10462    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
10463    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
10464  }
10465  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
10466    assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
10467
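         // Emulate the per-byte variable shift bit by bit: after 'a = a << 5'
         // the byte's top bit holds bit 2 of the shift amount, and each round
         // below conditionally shifts R by 4, then 2, then 1 using a VSELECT on
         // that bit, re-shifting 'a' (a += a) to expose the next amount bit.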
10468    // a = a << 5;
10469    Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1),
10470                     DAG.getConstant(5, MVT::i32));
10471    Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
10472
10473    // Turn 'a' into a mask suitable for VSELECT
10474    SDValue VSelM = DAG.getConstant(0x80, VT);
10475    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
10476    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
10477
10478    SDValue CM1 = DAG.getConstant(0x0f, VT);
10479    SDValue CM2 = DAG.getConstant(0x3f, VT);
10480
10481    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
10482    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
10483    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
10484                            DAG.getConstant(4, MVT::i32), DAG);
10485    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
10486    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
10487
10488    // a += a
10489    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
10490    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
10491    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
10492
10493    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
10494    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
10495    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
10496                            DAG.getConstant(2, MVT::i32), DAG);
10497    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
10498    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
10499
10500    // a += a
10501    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
10502    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
10503    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
10504
10505    // return VSELECT(r, r+r, a);
10506    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
10507                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
10508    return R;
10509  }
10510
10511  // Decompose 256-bit shifts into smaller 128-bit shifts.
10512  if (VT.getSizeInBits() == 256) {
10513    unsigned NumElems = VT.getVectorNumElements();
10514    MVT EltVT = VT.getVectorElementType().getSimpleVT();
10515    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
10516
10517    // Extract the two vectors
10518    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
10519    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
10520
10521    // Recreate the shift amount vectors
10522    SDValue Amt1, Amt2;
10523    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
10524      // Constant shift amount
10525      SmallVector<SDValue, 4> Amt1Csts;
10526      SmallVector<SDValue, 4> Amt2Csts;
10527      for (unsigned i = 0; i != NumElems/2; ++i)
10528        Amt1Csts.push_back(Amt->getOperand(i));
10529      for (unsigned i = NumElems/2; i != NumElems; ++i)
10530        Amt2Csts.push_back(Amt->getOperand(i));
10531
10532      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
10533                                 &Amt1Csts[0], NumElems/2);
10534      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
10535                                 &Amt2Csts[0], NumElems/2);
10536    } else {
10537      // Variable shift amount
10538      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
10539      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
10540    }
10541
10542    // Issue new vector shifts for the smaller types
10543    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
10544    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
10545
10546    // Concatenate the result back
10547    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
10548  }
10549
10550  return SDValue();
10551}
10552
10553SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
10554  // Lower the "add/sub/mul with overflow" instruction into a regular instruction
10555  // plus a "setcc" instruction that checks the overflow flag. The "brcond" lowering
10556  // looks for this combo and may remove the "setcc" instruction if the "setcc"
10557  // has only one use.
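       // For example, an i32 'saddo' becomes an X86ISD::ADD that also produces
       // EFLAGS, followed by an X86ISD::SETCC on X86::COND_O reading that flag.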
10558  SDNode *N = Op.getNode();
10559  SDValue LHS = N->getOperand(0);
10560  SDValue RHS = N->getOperand(1);
10561  unsigned BaseOp = 0;
10562  unsigned Cond = 0;
10563  DebugLoc DL = Op.getDebugLoc();
10564  switch (Op.getOpcode()) {
10565  default: llvm_unreachable("Unknown ovf instruction!");
10566  case ISD::SADDO:
10567    // An add of one will be selected as an INC. Note that INC doesn't
10568    // set CF, so we can't do this for UADDO.
10569    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
10570      if (C->isOne()) {
10571        BaseOp = X86ISD::INC;
10572        Cond = X86::COND_O;
10573        break;
10574      }
10575    BaseOp = X86ISD::ADD;
10576    Cond = X86::COND_O;
10577    break;
10578  case ISD::UADDO:
10579    BaseOp = X86ISD::ADD;
10580    Cond = X86::COND_B;
10581    break;
10582  case ISD::SSUBO:
10583    // A subtract of one will be selected as a DEC. Note that DEC doesn't
10584    // set CF, so we can't do this for USUBO.
10585    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
10586      if (C->isOne()) {
10587        BaseOp = X86ISD::DEC;
10588        Cond = X86::COND_O;
10589        break;
10590      }
10591    BaseOp = X86ISD::SUB;
10592    Cond = X86::COND_O;
10593    break;
10594  case ISD::USUBO:
10595    BaseOp = X86ISD::SUB;
10596    Cond = X86::COND_B;
10597    break;
10598  case ISD::SMULO:
10599    BaseOp = X86ISD::SMUL;
10600    Cond = X86::COND_O;
10601    break;
10602  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
10603    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
10604                                 MVT::i32);
10605    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
10606
10607    SDValue SetCC =
10608      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
10609                  DAG.getConstant(X86::COND_O, MVT::i32),
10610                  SDValue(Sum.getNode(), 2));
10611
10612    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
10613  }
10614  }
10615
10616  // Also sets EFLAGS.
10617  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
10618  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
10619
10620  SDValue SetCC =
10621    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
10622                DAG.getConstant(Cond, MVT::i32),
10623                SDValue(Sum.getNode(), 1));
10624
10625  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
10626}
10627
10628SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
10629                                                  SelectionDAG &DAG) const {
10630  DebugLoc dl = Op.getDebugLoc();
10631  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
10632  EVT VT = Op.getValueType();
10633
10634  if (!Subtarget->hasSSE2() || !VT.isVector())
10635    return SDValue();
10636
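  // Emit the extension as a vector shift left by the number of unused high bits
  // followed by an arithmetic shift right by the same amount.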
10637  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
10638                      ExtraVT.getScalarType().getSizeInBits();
10639  SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
10640
10641  switch (VT.getSimpleVT().SimpleTy) {
10642    default: return SDValue();
10643    case MVT::v8i32:
10644    case MVT::v16i16:
10645      if (!Subtarget->hasAVX())
10646        return SDValue();
10647      if (!Subtarget->hasAVX2()) {
10648        // Without AVX2 this has to be split into two 128-bit operations.
10649        unsigned NumElems = VT.getVectorNumElements();
10650
10651        // Extract the LHS vectors
10652        SDValue LHS = Op.getOperand(0);
10653        SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
10654        SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
10655
10656        MVT EltVT = VT.getVectorElementType().getSimpleVT();
10657        EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
10658
10659        EVT ExtraEltVT = ExtraVT.getVectorElementType();
10660        unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
10661        ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
10662                                   ExtraNumElems/2);
10663        SDValue Extra = DAG.getValueType(ExtraVT);
10664
10665        LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
10666        LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
10667
10668        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
10669      }
10670      // fall through
10671    case MVT::v4i32:
10672    case MVT::v8i16: {
10673      SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT,
10674                                         Op.getOperand(0), ShAmt, DAG);
10675      return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
10676    }
10677  }
10678}
10679
10680
10681SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
10682  DebugLoc dl = Op.getDebugLoc();
10683
10684  // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
10685  // There isn't any reason to disable it if the target processor supports it.
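  // Without SSE2 on a 32-bit target there is no fence instruction, so a locked
  // OR of zero to the top of the stack is used as a full memory barrier.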
10686  if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
10687    SDValue Chain = Op.getOperand(0);
10688    SDValue Zero = DAG.getConstant(0, MVT::i32);
10689    SDValue Ops[] = {
10690      DAG.getRegister(X86::ESP, MVT::i32), // Base
10691      DAG.getTargetConstant(1, MVT::i8),   // Scale
10692      DAG.getRegister(0, MVT::i32),        // Index
10693      DAG.getTargetConstant(0, MVT::i32),  // Disp
10694      DAG.getRegister(0, MVT::i32),        // Segment.
10695      Zero,
10696      Chain
10697    };
10698    SDNode *Res =
10699      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
10700                          array_lengthof(Ops));
10701    return SDValue(Res, 0);
10702  }
10703
10704  unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
10705  if (!isDev)
10706    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
10707
10708  unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
10709  unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
10710  unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
10711  unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
10712
10713  // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
10714  if (!Op1 && !Op2 && !Op3 && Op4)
10715    return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));
10716
10717  // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
10718  if (Op1 && !Op2 && !Op3 && !Op4)
10719    return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));
10720
10721  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
10722  //           (MFENCE)>;
10723  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
10724}
10725
10726SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
10727                                             SelectionDAG &DAG) const {
10728  DebugLoc dl = Op.getDebugLoc();
10729  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
10730    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
10731  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
10732    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
10733
10734  // The only fence that needs an instruction is a sequentially-consistent
10735  // cross-thread fence.
10736  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
10737    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
10738    // no-sse2). There isn't any reason to disable it if the target processor
10739    // supports it.
10740    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
10741      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
10742
10743    SDValue Chain = Op.getOperand(0);
10744    SDValue Zero = DAG.getConstant(0, MVT::i32);
10745    SDValue Ops[] = {
10746      DAG.getRegister(X86::ESP, MVT::i32), // Base
10747      DAG.getTargetConstant(1, MVT::i8),   // Scale
10748      DAG.getRegister(0, MVT::i32),        // Index
10749      DAG.getTargetConstant(0, MVT::i32),  // Disp
10750      DAG.getRegister(0, MVT::i32),        // Segment.
10751      Zero,
10752      Chain
10753    };
10754    SDNode *Res =
10755      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
10756                         array_lengthof(Ops));
10757    return SDValue(Res, 0);
10758  }
10759
10760  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
10761  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
10762}
10763
10764
10765SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
10766  EVT T = Op.getValueType();
10767  DebugLoc DL = Op.getDebugLoc();
10768  unsigned Reg = 0;
10769  unsigned size = 0;
10770  switch(T.getSimpleVT().SimpleTy) {
10771  default: llvm_unreachable("Invalid value type!");
10772  case MVT::i8:  Reg = X86::AL;  size = 1; break;
10773  case MVT::i16: Reg = X86::AX;  size = 2; break;
10774  case MVT::i32: Reg = X86::EAX; size = 4; break;
10775  case MVT::i64:
10776    assert(Subtarget->is64Bit() && "Node not type legal!");
10777    Reg = X86::RAX; size = 8;
10778    break;
10779  }
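  // CMPXCHG implicitly uses and defines the accumulator register: copy the
  // expected value in, emit the locked compare-and-exchange, and copy the
  // previous memory value back out as the result.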
10780  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
10781                                    Op.getOperand(2), SDValue());
10782  SDValue Ops[] = { cpIn.getValue(0),
10783                    Op.getOperand(1),
10784                    Op.getOperand(3),
10785                    DAG.getTargetConstant(size, MVT::i8),
10786                    cpIn.getValue(1) };
10787  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
10788  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
10789  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
10790                                           Ops, 5, T, MMO);
10791  SDValue cpOut =
10792    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
10793  return cpOut;
10794}
10795
10796SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
10797                                                 SelectionDAG &DAG) const {
10798  assert(Subtarget->is64Bit() && "Result not type legalized?");
10799  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
10800  SDValue TheChain = Op.getOperand(0);
10801  DebugLoc dl = Op.getDebugLoc();
10802  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
10803  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
10804  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
10805                                   rax.getValue(2));
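  // RDTSC leaves the low 32 bits of the counter in RAX and the high 32 bits in
  // RDX; shift the high half into place and OR the halves into a single i64.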
10806  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
10807                            DAG.getConstant(32, MVT::i8));
10808  SDValue Ops[] = {
10809    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
10810    rdx.getValue(1)
10811  };
10812  return DAG.getMergeValues(Ops, 2, dl);
10813}
10814
10815SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
10816                                            SelectionDAG &DAG) const {
10817  EVT SrcVT = Op.getOperand(0).getValueType();
10818  EVT DstVT = Op.getValueType();
10819  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
10820         Subtarget->hasMMX() && "Unexpected custom BITCAST");
10821  assert((DstVT == MVT::i64 ||
10822          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
10823         "Unexpected custom BITCAST");
10824  // i64 <=> MMX conversions are Legal.
10825  if (SrcVT==MVT::i64 && DstVT.isVector())
10826    return Op;
10827  if (DstVT==MVT::i64 && SrcVT.isVector())
10828    return Op;
10829  // MMX <=> MMX conversions are Legal.
10830  if (SrcVT.isVector() && DstVT.isVector())
10831    return Op;
10832  // All other conversions need to be expanded.
10833  return SDValue();
10834}
10835
10836SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
10837  SDNode *Node = Op.getNode();
10838  DebugLoc dl = Node->getDebugLoc();
10839  EVT T = Node->getValueType(0);
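  // x86 has no fetch-and-subtract that returns the previous value, so rewrite
  // the atomic subtract as an atomic add of the negated operand (0 - val).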
10840  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
10841                              DAG.getConstant(0, T), Node->getOperand(2));
10842  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
10843                       cast<AtomicSDNode>(Node)->getMemoryVT(),
10844                       Node->getOperand(0),
10845                       Node->getOperand(1), negOp,
10846                       cast<AtomicSDNode>(Node)->getSrcValue(),
10847                       cast<AtomicSDNode>(Node)->getAlignment(),
10848                       cast<AtomicSDNode>(Node)->getOrdering(),
10849                       cast<AtomicSDNode>(Node)->getSynchScope());
10850}
10851
10852static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
10853  SDNode *Node = Op.getNode();
10854  DebugLoc dl = Node->getDebugLoc();
10855  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
10856
10857  // Convert seq_cst store -> xchg
10858  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
10859  // FIXME: On 32-bit, store -> fist or movq would be more efficient
10860  //        (The only way to get a 16-byte store is cmpxchg16b)
10861  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
10862  if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
10863      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
10864    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
10865                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
10866                                 Node->getOperand(0),
10867                                 Node->getOperand(1), Node->getOperand(2),
10868                                 cast<AtomicSDNode>(Node)->getMemOperand(),
10869                                 cast<AtomicSDNode>(Node)->getOrdering(),
10870                                 cast<AtomicSDNode>(Node)->getSynchScope());
10871    return Swap.getValue(1);
10872  }
10873  // Other atomic stores have a simple pattern.
10874  return Op;
10875}
10876
10877static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
10878  EVT VT = Op.getNode()->getValueType(0);
10879
10880  // Let legalize expand this if it isn't a legal type yet.
10881  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
10882    return SDValue();
10883
10884  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
10885
10886  unsigned Opc;
10887  bool ExtraOp = false;
10888  switch (Op.getOpcode()) {
10889  default: llvm_unreachable("Invalid code");
10890  case ISD::ADDC: Opc = X86ISD::ADD; break;
10891  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
10892  case ISD::SUBC: Opc = X86ISD::SUB; break;
10893  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
10894  }
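  // ADDE/SUBE take the incoming carry (EFLAGS) as an extra operand, which the
  // ADC/SBB nodes consume; ADDC/SUBC only produce a carry.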
10895
10896  if (!ExtraOp)
10897    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
10898                       Op.getOperand(1));
10899  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
10900                     Op.getOperand(1), Op.getOperand(2));
10901}
10902
10903/// LowerOperation - Provide custom lowering hooks for some operations.
10904///
10905SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
10906  switch (Op.getOpcode()) {
10907  default: llvm_unreachable("Should not custom lower this!");
10908  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
10909  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op,DAG);
10910  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op,DAG);
10911  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
10912  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
10913  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
10914  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
10915  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
10916  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
10917  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
10918  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
10919  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, DAG);
10920  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, DAG);
10921  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
10922  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
10923  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
10924  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
10925  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
10926  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
10927  case ISD::SHL_PARTS:
10928  case ISD::SRA_PARTS:
10929  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
10930  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
10931  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
10932  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
10933  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
10934  case ISD::FABS:               return LowerFABS(Op, DAG);
10935  case ISD::FNEG:               return LowerFNEG(Op, DAG);
10936  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
10937  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
10938  case ISD::SETCC:              return LowerSETCC(Op, DAG);
10939  case ISD::SELECT:             return LowerSELECT(Op, DAG);
10940  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
10941  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
10942  case ISD::VASTART:            return LowerVASTART(Op, DAG);
10943  case ISD::VAARG:              return LowerVAARG(Op, DAG);
10944  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
10945  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
10946  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
10947  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
10948  case ISD::FRAME_TO_ARGS_OFFSET:
10949                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
10950  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
10951  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
10952  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
10953  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
10954  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
10955  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
10956  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
10957  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
10958  case ISD::MUL:                return LowerMUL(Op, DAG);
10959  case ISD::SRA:
10960  case ISD::SRL:
10961  case ISD::SHL:                return LowerShift(Op, DAG);
10962  case ISD::SADDO:
10963  case ISD::UADDO:
10964  case ISD::SSUBO:
10965  case ISD::USUBO:
10966  case ISD::SMULO:
10967  case ISD::UMULO:              return LowerXALUO(Op, DAG);
10968  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
10969  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
10970  case ISD::ADDC:
10971  case ISD::ADDE:
10972  case ISD::SUBC:
10973  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
10974  case ISD::ADD:                return LowerADD(Op, DAG);
10975  case ISD::SUB:                return LowerSUB(Op, DAG);
10976  }
10977}
10978
10979static void ReplaceATOMIC_LOAD(SDNode *Node,
10980                                  SmallVectorImpl<SDValue> &Results,
10981                                  SelectionDAG &DAG) {
10982  DebugLoc dl = Node->getDebugLoc();
10983  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
10984
10985  // Convert wide load -> cmpxchg8b/cmpxchg16b
10986  // FIXME: On 32-bit, load -> fild or movq would be more efficient
10987  //        (The only way to get a 16-byte load is cmpxchg16b)
10988  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
10989  SDValue Zero = DAG.getConstant(0, VT);
10990  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
10991                               Node->getOperand(0),
10992                               Node->getOperand(1), Zero, Zero,
10993                               cast<AtomicSDNode>(Node)->getMemOperand(),
10994                               cast<AtomicSDNode>(Node)->getOrdering(),
10995                               cast<AtomicSDNode>(Node)->getSynchScope());
10996  Results.push_back(Swap.getValue(0));
10997  Results.push_back(Swap.getValue(1));
10998}
10999
11000void X86TargetLowering::
11001ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
11002                        SelectionDAG &DAG, unsigned NewOp) const {
11003  DebugLoc dl = Node->getDebugLoc();
11004  assert(Node->getValueType(0) == MVT::i64 &&
11005         "Only know how to expand i64 atomics");
11006
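  // Split the i64 value operand into two i32 halves and emit a pseudo that is
  // later expanded into a CMPXCHG8B loop; the two i32 results are reassembled
  // into an i64 with BUILD_PAIR.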
11007  SDValue Chain = Node->getOperand(0);
11008  SDValue In1 = Node->getOperand(1);
11009  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
11010                             Node->getOperand(2), DAG.getIntPtrConstant(0));
11011  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
11012                             Node->getOperand(2), DAG.getIntPtrConstant(1));
11013  SDValue Ops[] = { Chain, In1, In2L, In2H };
11014  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
11015  SDValue Result =
11016    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
11017                            cast<MemSDNode>(Node)->getMemOperand());
11018  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
11019  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
11020  Results.push_back(Result.getValue(2));
11021}
11022
11023/// ReplaceNodeResults - Replace a node with an illegal result type
11024/// with a new node built out of custom code.
11025void X86TargetLowering::ReplaceNodeResults(SDNode *N,
11026                                           SmallVectorImpl<SDValue>&Results,
11027                                           SelectionDAG &DAG) const {
11028  DebugLoc dl = N->getDebugLoc();
11029  switch (N->getOpcode()) {
11030  default:
11031    llvm_unreachable("Do not know how to custom type legalize this operation!");
11032  case ISD::SIGN_EXTEND_INREG:
11033  case ISD::ADDC:
11034  case ISD::ADDE:
11035  case ISD::SUBC:
11036  case ISD::SUBE:
11037    // We don't want to expand or promote these.
11038    return;
11039  case ISD::FP_TO_SINT:
11040  case ISD::FP_TO_UINT: {
11041    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
11042
11043    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
11044      return;
11045
11046    std::pair<SDValue,SDValue> Vals =
11047        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
11048    SDValue FIST = Vals.first, StackSlot = Vals.second;
11049    if (FIST.getNode() != 0) {
11050      EVT VT = N->getValueType(0);
11051      // Return a load from the stack slot.
11052      if (StackSlot.getNode() != 0)
11053        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
11054                                      MachinePointerInfo(),
11055                                      false, false, false, 0));
11056      else
11057        Results.push_back(FIST);
11058    }
11059    return;
11060  }
11061  case ISD::READCYCLECOUNTER: {
11062    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11063    SDValue TheChain = N->getOperand(0);
11064    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
11065    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
11066                                     rd.getValue(1));
11067    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
11068                                     eax.getValue(2));
11069    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
11070    SDValue Ops[] = { eax, edx };
11071    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
11072    Results.push_back(edx.getValue(1));
11073    return;
11074  }
11075  case ISD::ATOMIC_CMP_SWAP: {
11076    EVT T = N->getValueType(0);
11077    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
11078    bool Regs64bit = T == MVT::i128;
11079    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
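    // CMPXCHG8B/CMPXCHG16B take the expected value in EDX:EAX (RDX:RAX), the new
    // value in ECX:EBX (RCX:RBX), and return the previous memory value in
    // EDX:EAX (RDX:RAX).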
11080    SDValue cpInL, cpInH;
11081    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
11082                        DAG.getConstant(0, HalfT));
11083    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
11084                        DAG.getConstant(1, HalfT));
11085    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
11086                             Regs64bit ? X86::RAX : X86::EAX,
11087                             cpInL, SDValue());
11088    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
11089                             Regs64bit ? X86::RDX : X86::EDX,
11090                             cpInH, cpInL.getValue(1));
11091    SDValue swapInL, swapInH;
11092    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
11093                          DAG.getConstant(0, HalfT));
11094    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
11095                          DAG.getConstant(1, HalfT));
11096    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
11097                               Regs64bit ? X86::RBX : X86::EBX,
11098                               swapInL, cpInH.getValue(1));
11099    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
11100                               Regs64bit ? X86::RCX : X86::ECX,
11101                               swapInH, swapInL.getValue(1));
11102    SDValue Ops[] = { swapInH.getValue(0),
11103                      N->getOperand(1),
11104                      swapInH.getValue(1) };
11105    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11106    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
11107    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
11108                                  X86ISD::LCMPXCHG8_DAG;
11109    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
11110                                             Ops, 3, T, MMO);
11111    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
11112                                        Regs64bit ? X86::RAX : X86::EAX,
11113                                        HalfT, Result.getValue(1));
11114    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
11115                                        Regs64bit ? X86::RDX : X86::EDX,
11116                                        HalfT, cpOutL.getValue(2));
11117    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
11118    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
11119    Results.push_back(cpOutH.getValue(1));
11120    return;
11121  }
11122  case ISD::ATOMIC_LOAD_ADD:
11123    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
11124    return;
11125  case ISD::ATOMIC_LOAD_AND:
11126    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
11127    return;
11128  case ISD::ATOMIC_LOAD_NAND:
11129    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
11130    return;
11131  case ISD::ATOMIC_LOAD_OR:
11132    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
11133    return;
11134  case ISD::ATOMIC_LOAD_SUB:
11135    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
11136    return;
11137  case ISD::ATOMIC_LOAD_XOR:
11138    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
11139    return;
11140  case ISD::ATOMIC_SWAP:
11141    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
11142    return;
11143  case ISD::ATOMIC_LOAD:
11144    ReplaceATOMIC_LOAD(N, Results, DAG);
11145  }
11146}
11147
11148const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
11149  switch (Opcode) {
11150  default: return NULL;
11151  case X86ISD::BSF:                return "X86ISD::BSF";
11152  case X86ISD::BSR:                return "X86ISD::BSR";
11153  case X86ISD::SHLD:               return "X86ISD::SHLD";
11154  case X86ISD::SHRD:               return "X86ISD::SHRD";
11155  case X86ISD::FAND:               return "X86ISD::FAND";
11156  case X86ISD::FOR:                return "X86ISD::FOR";
11157  case X86ISD::FXOR:               return "X86ISD::FXOR";
11158  case X86ISD::FSRL:               return "X86ISD::FSRL";
11159  case X86ISD::FILD:               return "X86ISD::FILD";
11160  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
11161  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
11162  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
11163  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
11164  case X86ISD::FLD:                return "X86ISD::FLD";
11165  case X86ISD::FST:                return "X86ISD::FST";
11166  case X86ISD::CALL:               return "X86ISD::CALL";
11167  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
11168  case X86ISD::BT:                 return "X86ISD::BT";
11169  case X86ISD::CMP:                return "X86ISD::CMP";
11170  case X86ISD::COMI:               return "X86ISD::COMI";
11171  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
11172  case X86ISD::SETCC:              return "X86ISD::SETCC";
11173  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
11174  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
11175  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
11176  case X86ISD::CMOV:               return "X86ISD::CMOV";
11177  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
11178  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
11179  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
11180  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
11181  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
11182  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
11183  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
11184  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
11185  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
11186  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
11187  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
11188  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
11189  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
11190  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
11191  case X86ISD::PSIGN:              return "X86ISD::PSIGN";
11192  case X86ISD::BLENDV:             return "X86ISD::BLENDV";
11193  case X86ISD::BLENDPW:            return "X86ISD::BLENDPW";
11194  case X86ISD::BLENDPS:            return "X86ISD::BLENDPS";
11195  case X86ISD::BLENDPD:            return "X86ISD::BLENDPD";
11196  case X86ISD::HADD:               return "X86ISD::HADD";
11197  case X86ISD::HSUB:               return "X86ISD::HSUB";
11198  case X86ISD::FHADD:              return "X86ISD::FHADD";
11199  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
11200  case X86ISD::FMAX:               return "X86ISD::FMAX";
11201  case X86ISD::FMIN:               return "X86ISD::FMIN";
11202  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
11203  case X86ISD::FRCP:               return "X86ISD::FRCP";
11204  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
11205  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
11206  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
11207  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
11208  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
11209  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
11210  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
11211  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
11212  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
11213  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
11214  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
11215  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
11216  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
11217  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
11218  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
11219  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
11220  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
11221  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
11222  case X86ISD::VSHL:               return "X86ISD::VSHL";
11223  case X86ISD::VSRL:               return "X86ISD::VSRL";
11224  case X86ISD::VSRA:               return "X86ISD::VSRA";
11225  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
11226  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
11227  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
11228  case X86ISD::CMPP:               return "X86ISD::CMPP";
11229  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
11230  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
11231  case X86ISD::ADD:                return "X86ISD::ADD";
11232  case X86ISD::SUB:                return "X86ISD::SUB";
11233  case X86ISD::ADC:                return "X86ISD::ADC";
11234  case X86ISD::SBB:                return "X86ISD::SBB";
11235  case X86ISD::SMUL:               return "X86ISD::SMUL";
11236  case X86ISD::UMUL:               return "X86ISD::UMUL";
11237  case X86ISD::INC:                return "X86ISD::INC";
11238  case X86ISD::DEC:                return "X86ISD::DEC";
11239  case X86ISD::OR:                 return "X86ISD::OR";
11240  case X86ISD::XOR:                return "X86ISD::XOR";
11241  case X86ISD::AND:                return "X86ISD::AND";
11242  case X86ISD::ANDN:               return "X86ISD::ANDN";
11243  case X86ISD::BLSI:               return "X86ISD::BLSI";
11244  case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
11245  case X86ISD::BLSR:               return "X86ISD::BLSR";
11246  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
11247  case X86ISD::PTEST:              return "X86ISD::PTEST";
11248  case X86ISD::TESTP:              return "X86ISD::TESTP";
11249  case X86ISD::PALIGN:             return "X86ISD::PALIGN";
11250  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
11251  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
11252  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
11253  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
11254  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
11255  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
11256  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
11257  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
11258  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
11259  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
11260  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
11261  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
11262  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
11263  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
11264  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
11265  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
11266  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
11267  case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
11268  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
11269  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
11270  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
11271  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
11272  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
11273  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
11274  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
11275  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
11276  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
11277  case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
11278  case X86ISD::SAHF:               return "X86ISD::SAHF";
11279  }
11280}
11281
11282// isLegalAddressingMode - Return true if the addressing mode represented
11283// by AM is legal for this target, for a load/store of the specified type.
11284bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
11285                                              Type *Ty) const {
11286  // X86 supports extremely general addressing modes.
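  // A memory operand may combine a base register, a scaled index register
  // (scale 1, 2, 4 or 8), a signed 32-bit displacement and, in some cases, a
  // global symbol folded into that displacement.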
11287  CodeModel::Model M = getTargetMachine().getCodeModel();
11288  Reloc::Model R = getTargetMachine().getRelocationModel();
11289
11290  // X86 allows a sign-extended 32-bit immediate field as a displacement.
11291  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
11292    return false;
11293
11294  if (AM.BaseGV) {
11295    unsigned GVFlags =
11296      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
11297
11298    // If a reference to this global requires an extra load, we can't fold it.
11299    if (isGlobalStubReference(GVFlags))
11300      return false;
11301
11302    // If BaseGV requires a register for the PIC base, we cannot also have a
11303    // BaseReg specified.
11304    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
11305      return false;
11306
11307    // If lower 4G is not available, then we must use rip-relative addressing.
11308    if ((M != CodeModel::Small || R != Reloc::Static) &&
11309        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
11310      return false;
11311  }
11312
11313  switch (AM.Scale) {
11314  case 0:
11315  case 1:
11316  case 2:
11317  case 4:
11318  case 8:
11319    // These scales always work.
11320    break;
11321  case 3:
11322  case 5:
11323  case 9:
11324    // These scales are formed with basereg+scalereg.  Only accept if there is
11325    // no basereg yet.
11326    if (AM.HasBaseReg)
11327      return false;
11328    break;
11329  default:  // Other stuff never works.
11330    return false;
11331  }
11332
11333  return true;
11334}
11335
11336
11337bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
11338  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11339    return false;
11340  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
11341  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
11342  if (NumBits1 <= NumBits2)
11343    return false;
11344  return true;
11345}
11346
11347bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
11348  if (!VT1.isInteger() || !VT2.isInteger())
11349    return false;
11350  unsigned NumBits1 = VT1.getSizeInBits();
11351  unsigned NumBits2 = VT2.getSizeInBits();
11352  if (NumBits1 <= NumBits2)
11353    return false;
11354  return true;
11355}
11356
11357bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
11358  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
11359  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
11360}
11361
11362bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
11363  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
11364  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
11365}
11366
11367bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
11368  // i16 instructions are longer (0x66 prefix) and potentially slower.
11369  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
11370}
11371
11372/// isShuffleMaskLegal - Targets can use this to indicate that they only
11373/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
11374/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
11375/// are assumed to be legal.
11376bool
11377X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
11378                                      EVT VT) const {
11379  // Very little shuffling can be done for 64-bit vectors right now.
11380  if (VT.getSizeInBits() == 64)
11381    return false;
11382
11383  // FIXME: pshufb, blends, shifts.
11384  return (VT.getVectorNumElements() == 2 ||
11385          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
11386          isMOVLMask(M, VT) ||
11387          isSHUFPMask(M, VT, Subtarget->hasAVX()) ||
11388          isPSHUFDMask(M, VT) ||
11389          isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) ||
11390          isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) ||
11391          isPALIGNRMask(M, VT, Subtarget) ||
11392          isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
11393          isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
11394          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) ||
11395          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2()));
11396}
11397
11398bool
11399X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
11400                                          EVT VT) const {
11401  unsigned NumElts = VT.getVectorNumElements();
11402  // FIXME: This collection of masks seems suspect.
11403  if (NumElts == 2)
11404    return true;
11405  if (NumElts == 4 && VT.getSizeInBits() == 128) {
11406    return (isMOVLMask(Mask, VT)  ||
11407            isCommutedMOVLMask(Mask, VT, true) ||
11408            isSHUFPMask(Mask, VT, Subtarget->hasAVX()) ||
11409            isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true));
11410  }
11411  return false;
11412}
11413
11414//===----------------------------------------------------------------------===//
11415//                           X86 Scheduler Hooks
11416//===----------------------------------------------------------------------===//
11417
11418// private utility function
11419MachineBasicBlock *
11420X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
11421                                                       MachineBasicBlock *MBB,
11422                                                       unsigned regOpc,
11423                                                       unsigned immOpc,
11424                                                       unsigned LoadOpc,
11425                                                       unsigned CXchgOpc,
11426                                                       unsigned notOpc,
11427                                                       unsigned EAXreg,
11428                                                 const TargetRegisterClass *RC,
11429                                                       bool Invert) const {
11430  // For the atomic bitwise operator, we generate
11431  //   thisMBB:
11432  //   newMBB:
11433  //     ld  t1 = [bitinstr.addr]
11434  //     op  t2 = t1, [bitinstr.val]
11435  //     not t3 = t2  (if Invert)
11436  //     mov EAX = t1
11437  //     lcs dest = [bitinstr.addr], t3  [EAX is implicit]
11438  //     bz  newMBB
11439  //     fallthrough -->nextMBB
11440  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11441  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
11442  MachineFunction::iterator MBBIter = MBB;
11443  ++MBBIter;
11444
11445  /// First build the CFG
11446  MachineFunction *F = MBB->getParent();
11447  MachineBasicBlock *thisMBB = MBB;
11448  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
11449  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
11450  F->insert(MBBIter, newMBB);
11451  F->insert(MBBIter, nextMBB);
11452
11453  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
11454  nextMBB->splice(nextMBB->begin(), thisMBB,
11455                  llvm::next(MachineBasicBlock::iterator(bInstr)),
11456                  thisMBB->end());
11457  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
11458
11459  // Update thisMBB to fall through to newMBB
11460  thisMBB->addSuccessor(newMBB);
11461
11462  // newMBB jumps to itself and falls through to nextMBB
11463  newMBB->addSuccessor(nextMBB);
11464  newMBB->addSuccessor(newMBB);
11465
11466  // Insert instructions into newMBB based on incoming instruction
11467  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
11468         "unexpected number of operands");
11469  DebugLoc dl = bInstr->getDebugLoc();
11470  MachineOperand& destOper = bInstr->getOperand(0);
11471  MachineOperand* argOpers[2 + X86::AddrNumOperands];
11472  int numArgs = bInstr->getNumOperands() - 1;
11473  for (int i=0; i < numArgs; ++i)
11474    argOpers[i] = &bInstr->getOperand(i+1);
11475
11476  // x86 address has 5 operands: base, scale, index, displacement, and segment
11477  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
11478  int valArgIndx = lastAddrIndx + 1;
11479
11480  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
11481  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
11482  for (int i=0; i <= lastAddrIndx; ++i)
11483    (*MIB).addOperand(*argOpers[i]);
11484
11485  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
11486  assert((argOpers[valArgIndx]->isReg() ||
11487          argOpers[valArgIndx]->isImm()) &&
11488         "invalid operand");
11489  if (argOpers[valArgIndx]->isReg())
11490    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
11491  else
11492    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
11493  MIB.addReg(t1);
11494  (*MIB).addOperand(*argOpers[valArgIndx]);
11495
11496  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
11497  if (Invert) {
11498    MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2);
11499  } else {
11500    t3 = t2;
11501  }
11502
11503  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
11504  MIB.addReg(t1);
11505
11506  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
11507  for (int i=0; i <= lastAddrIndx; ++i)
11508    (*MIB).addOperand(*argOpers[i]);
11509  MIB.addReg(t3);
11510  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
11511  (*MIB).setMemRefs(bInstr->memoperands_begin(),
11512                    bInstr->memoperands_end());
11513
11514  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
11515  MIB.addReg(EAXreg);
11516
11517  // insert branch
11518  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
11519
11520  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
11521  return nextMBB;
11522}
11523
11524// Private utility function: 64-bit atomics on a 32-bit host.
11525MachineBasicBlock *
11526X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
11527                                                       MachineBasicBlock *MBB,
11528                                                       unsigned regOpcL,
11529                                                       unsigned regOpcH,
11530                                                       unsigned immOpcL,
11531                                                       unsigned immOpcH,
11532                                                       bool Invert) const {
11533  // For the atomic bitwise operator, we generate
11534  //   thisMBB (instructions are in pairs, except cmpxchg8b)
11535  //     ld t1,t2 = [bitinstr.addr]
11536  //   newMBB:
11537  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
11538  //     op  t5, t6 <- out1, out2, [bitinstr.val]
11539  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
11540  //     not t7, t8 <- t5, t6  (if Invert)
11541  //     mov ECX, EBX <- t5, t6
11542  //     mov EAX, EDX <- t1, t2
11543  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
11544  //     mov t3, t4 <- EAX, EDX
11545  //     bz  newMBB
11546  //     result in out1, out2
11547  //     fallthrough -->nextMBB
11548
11549  const TargetRegisterClass *RC = &X86::GR32RegClass;
11550  const unsigned LoadOpc = X86::MOV32rm;
11551  const unsigned NotOpc = X86::NOT32r;
11552  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11553  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
11554  MachineFunction::iterator MBBIter = MBB;
11555  ++MBBIter;
11556
11557  /// First build the CFG
11558  MachineFunction *F = MBB->getParent();
11559  MachineBasicBlock *thisMBB = MBB;
11560  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
11561  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
11562  F->insert(MBBIter, newMBB);
11563  F->insert(MBBIter, nextMBB);
11564
11565  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
11566  nextMBB->splice(nextMBB->begin(), thisMBB,
11567                  llvm::next(MachineBasicBlock::iterator(bInstr)),
11568                  thisMBB->end());
11569  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
11570
11571  // Update thisMBB to fall through to newMBB
11572  thisMBB->addSuccessor(newMBB);
11573
11574  // newMBB jumps to itself and falls through to nextMBB
11575  newMBB->addSuccessor(nextMBB);
11576  newMBB->addSuccessor(newMBB);
11577
11578  DebugLoc dl = bInstr->getDebugLoc();
11579  // Insert instructions into newMBB based on incoming instruction
11580  // There are 9 "real" operands plus 9 implicit def/uses, ignored here.
11581  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
11582         "unexpected number of operands");
11583  MachineOperand& dest1Oper = bInstr->getOperand(0);
11584  MachineOperand& dest2Oper = bInstr->getOperand(1);
11585  MachineOperand* argOpers[2 + X86::AddrNumOperands];
11586  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
11587    argOpers[i] = &bInstr->getOperand(i+2);
11588
11589    // We use some of the operands multiple times, so conservatively just
11590    // clear any kill flags that might be present.
11591    if (argOpers[i]->isReg() && argOpers[i]->isUse())
11592      argOpers[i]->setIsKill(false);
11593  }
11594
11595  // x86 address has 5 operands: base, scale, index, displacement, and segment.
11596  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
11597
11598  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
11599  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
11600  for (int i=0; i <= lastAddrIndx; ++i)
11601    (*MIB).addOperand(*argOpers[i]);
11602  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
11603  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
11604  // Load the high 32 bits: same address with the displacement advanced by 4.
11605  for (int i=0; i <= lastAddrIndx-2; ++i)
11606    (*MIB).addOperand(*argOpers[i]);
11607  MachineOperand newOp3 = *(argOpers[3]);
11608  if (newOp3.isImm())
11609    newOp3.setImm(newOp3.getImm()+4);
11610  else
11611    newOp3.setOffset(newOp3.getOffset()+4);
11612  (*MIB).addOperand(newOp3);
11613  (*MIB).addOperand(*argOpers[lastAddrIndx]);
11614
11615  // t3/4 are defined later, at the bottom of the loop
11616  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
11617  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
11618  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
11619    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
11620  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
11621    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
11622
11623  // The subsequent operations should be using the destination registers of
11624  // the PHI instructions.
11625  t1 = dest1Oper.getReg();
11626  t2 = dest2Oper.getReg();
11627
11628  int valArgIndx = lastAddrIndx + 1;
11629  assert((argOpers[valArgIndx]->isReg() ||
11630          argOpers[valArgIndx]->isImm()) &&
11631         "invalid operand");
11632  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
11633  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
11634  if (argOpers[valArgIndx]->isReg())
11635    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
11636  else
11637    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
11638  if (regOpcL != X86::MOV32rr)
11639    MIB.addReg(t1);
11640  (*MIB).addOperand(*argOpers[valArgIndx]);
11641  assert(argOpers[valArgIndx + 1]->isReg() ==
11642         argOpers[valArgIndx]->isReg());
11643  assert(argOpers[valArgIndx + 1]->isImm() ==
11644         argOpers[valArgIndx]->isImm());
11645  if (argOpers[valArgIndx + 1]->isReg())
11646    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
11647  else
11648    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
11649  if (regOpcH != X86::MOV32rr)
11650    MIB.addReg(t2);
11651  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
11652
11653  unsigned t7, t8;
11654  if (Invert) {
11655    t7 = F->getRegInfo().createVirtualRegister(RC);
11656    t8 = F->getRegInfo().createVirtualRegister(RC);
11657    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5);
11658    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6);
11659  } else {
11660    t7 = t5;
11661    t8 = t6;
11662  }
11663
11664  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
11665  MIB.addReg(t1);
11666  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
11667  MIB.addReg(t2);
11668
11669  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
11670  MIB.addReg(t7);
11671  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
11672  MIB.addReg(t8);
11673
11674  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
11675  for (int i=0; i <= lastAddrIndx; ++i)
11676    (*MIB).addOperand(*argOpers[i]);
11677
11678  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
11679  (*MIB).setMemRefs(bInstr->memoperands_begin(),
11680                    bInstr->memoperands_end());
11681
11682  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
11683  MIB.addReg(X86::EAX);
11684  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
11685  MIB.addReg(X86::EDX);
11686
11687  // insert branch
11688  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
11689
11690  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
11691  return nextMBB;
11692}
11693
11694// private utility function
11695MachineBasicBlock *
11696X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
11697                                                      MachineBasicBlock *MBB,
11698                                                      unsigned cmovOpc) const {
11699  // For the atomic min/max operator, we generate
11700  //   thisMBB:
11701  //   newMBB:
11702  //     ld t1 = [min/max.addr]
11703  //     mov t2 = [min/max.val]
11704  //     cmp  t1, t2
11705  //     cmov[cond] t2 = t1
11706  //     mov EAX = t1
11707  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
11708  //     bz   newMBB
11709  //     fallthrough -->nextMBB
11710  //
11711  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11712  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
11713  MachineFunction::iterator MBBIter = MBB;
11714  ++MBBIter;
11715
11716  /// First build the CFG
11717  MachineFunction *F = MBB->getParent();
11718  MachineBasicBlock *thisMBB = MBB;
11719  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
11720  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
11721  F->insert(MBBIter, newMBB);
11722  F->insert(MBBIter, nextMBB);
11723
11724  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
11725  nextMBB->splice(nextMBB->begin(), thisMBB,
11726                  llvm::next(MachineBasicBlock::iterator(mInstr)),
11727                  thisMBB->end());
11728  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
11729
11730  // Update thisMBB to fall through to newMBB
11731  thisMBB->addSuccessor(newMBB);
11732
11733  // newMBB jumps to itself and falls through to nextMBB
11734  newMBB->addSuccessor(nextMBB);
11735  newMBB->addSuccessor(newMBB);
11736
11737  DebugLoc dl = mInstr->getDebugLoc();
11738  // Insert instructions into newMBB based on incoming instruction
11739  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
11740         "unexpected number of operands");
11741  MachineOperand& destOper = mInstr->getOperand(0);
11742  MachineOperand* argOpers[2 + X86::AddrNumOperands];
11743  int numArgs = mInstr->getNumOperands() - 1;
11744  for (int i=0; i < numArgs; ++i)
11745    argOpers[i] = &mInstr->getOperand(i+1);
11746
11747  // x86 address has 5 operands: base, scale, index, displacement, and segment
11748  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
11749  int valArgIndx = lastAddrIndx + 1;
11750
11751  unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
11752  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
11753  for (int i=0; i <= lastAddrIndx; ++i)
11754    (*MIB).addOperand(*argOpers[i]);
11755
11756  // We only support register and immediate values
11757  assert((argOpers[valArgIndx]->isReg() ||
11758          argOpers[valArgIndx]->isImm()) &&
11759         "invalid operand");
11760
11761  unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
11762  if (argOpers[valArgIndx]->isReg())
11763    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
11764  else
11765    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2); // move-immediate for an immediate operand
11766  (*MIB).addOperand(*argOpers[valArgIndx]);
11767
11768  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
11769  MIB.addReg(t1);
11770
11771  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
11772  MIB.addReg(t1);
11773  MIB.addReg(t2);
11774
11775  // Generate the cmov: t3 gets the value to store back (min or max of t1 and t2)
11776  unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
11777  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
11778  MIB.addReg(t2);
11779  MIB.addReg(t1);
11780
11781  // Compare and exchange: store t3 only if the location still holds t1 (in EAX)
11782  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
11783  for (int i=0; i <= lastAddrIndx; ++i)
11784    (*MIB).addOperand(*argOpers[i]);
11785  MIB.addReg(t3);
11786  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
11787  (*MIB).setMemRefs(mInstr->memoperands_begin(),
11788                    mInstr->memoperands_end());
11789
11790  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
11791  MIB.addReg(X86::EAX);
11792
11793  // insert branch
11794  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
11795
11796  mInstr->eraseFromParent();   // The pseudo instruction is gone now.
11797  return nextMBB;
11798}
11799
11800// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
11801// or XMM0_V32I8 in AVX all of this code can be replaced with that
11802// in the .td file.
11803MachineBasicBlock *
11804X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
11805                            unsigned numArgs, bool memArg) const {
11806  assert(Subtarget->hasSSE42() &&
11807         "Target must have SSE4.2 or AVX features enabled");
11808
11809  DebugLoc dl = MI->getDebugLoc();
11810  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11811  unsigned Opc;
11812  if (!Subtarget->hasAVX()) {
11813    if (memArg)
11814      Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
11815    else
11816      Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
11817  } else {
11818    if (memArg)
11819      Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
11820    else
11821      Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
11822  }
11823
11824  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
11825  for (unsigned i = 0; i < numArgs; ++i) {
11826    MachineOperand &Op = MI->getOperand(i+1);
11827    if (!(Op.isReg() && Op.isImplicit()))
11828      MIB.addOperand(Op);
11829  }
11830  BuildMI(*BB, MI, dl,
11831    TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr),
11832             MI->getOperand(0).getReg())
11833    .addReg(X86::XMM0);
11834
11835  MI->eraseFromParent();
11836  return BB;
11837}
11838
11839MachineBasicBlock *
11840X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
11841  DebugLoc dl = MI->getDebugLoc();
11842  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11843
11844  // Address into RAX/EAX, other two args into ECX, EDX.
11845  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
11846  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
11847  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
11848  for (int i = 0; i < X86::AddrNumOperands; ++i)
11849    MIB.addOperand(MI->getOperand(i));
11850
11851  unsigned ValOps = X86::AddrNumOperands;
11852  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
11853    .addReg(MI->getOperand(ValOps).getReg());
11854  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
11855    .addReg(MI->getOperand(ValOps+1).getReg());
11856
11857  // MONITOR takes its operands implicitly in RAX/EAX, ECX and EDX.
11858  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
11859
11860  MI->eraseFromParent(); // The pseudo is gone now.
11861  return BB;
11862}
11863
11864MachineBasicBlock *
11865X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
11866  DebugLoc dl = MI->getDebugLoc();
11867  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11868
11869  // First arg in ECX, the second in EAX.
11870  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
11871    .addReg(MI->getOperand(0).getReg());
11872  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
11873    .addReg(MI->getOperand(1).getReg());
11874
11875  // MWAIT takes its operands implicitly in ECX and EAX.
11876  BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));
11877
11878  MI->eraseFromParent(); // The pseudo is gone now.
11879  return BB;
11880}
11881
11882MachineBasicBlock *
11883X86TargetLowering::EmitVAARG64WithCustomInserter(
11884                   MachineInstr *MI,
11885                   MachineBasicBlock *MBB) const {
11886  // Emit va_arg instruction on X86-64.
11887
11888  // Operands to this pseudo-instruction:
11889  // 0  ) Output        : destination address (reg)
11890  // 1-5) Input         : va_list address (addr, i64mem)
11891  // 6  ) ArgSize       : Size (in bytes) of vararg type
11892  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
11893  // 8  ) Align         : Alignment of type
11894  // 9  ) EFLAGS (implicit-def)
11895
11896  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
11897  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
11898
11899  unsigned DestReg = MI->getOperand(0).getReg();
11900  MachineOperand &Base = MI->getOperand(1);
11901  MachineOperand &Scale = MI->getOperand(2);
11902  MachineOperand &Index = MI->getOperand(3);
11903  MachineOperand &Disp = MI->getOperand(4);
11904  MachineOperand &Segment = MI->getOperand(5);
11905  unsigned ArgSize = MI->getOperand(6).getImm();
11906  unsigned ArgMode = MI->getOperand(7).getImm();
11907  unsigned Align = MI->getOperand(8).getImm();
11908
11909  // Memory Reference
11910  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
11911  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
11912  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
11913
11914  // Machine Information
11915  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11916  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11917  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
11918  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
11919  DebugLoc DL = MI->getDebugLoc();
11920
11921  // struct va_list {
11922  //   i32   gp_offset
11923  //   i32   fp_offset
11924  //   i64   overflow_area (address)
11925  //   i64   reg_save_area (address)
11926  // }
11927  // sizeof(va_list) = 24
11928  // alignment(va_list) = 8
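  //
  // For reference, the field displacements used below follow directly from
  // this layout: gp_offset at +0, fp_offset at +4, overflow_area at +8, and
  // reg_save_area at +16 from the start of the va_list.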
11929
11930  unsigned TotalNumIntRegs = 6;
11931  unsigned TotalNumXMMRegs = 8;
11932  bool UseGPOffset = (ArgMode == 1);
11933  bool UseFPOffset = (ArgMode == 2);
11934  unsigned MaxOffset = TotalNumIntRegs * 8 +
11935                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
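  // i.e. MaxOffset is 6*8 = 48 when pulling a GP argument, and
  // 48 + 8*16 = 176 when pulling an FP (XMM) argument.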
11936
11937  // Align ArgSize to a multiple of 8.
11938  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
11939  bool NeedsAlign = (Align > 8);
11940
11941  MachineBasicBlock *thisMBB = MBB;
11942  MachineBasicBlock *overflowMBB;
11943  MachineBasicBlock *offsetMBB;
11944  MachineBasicBlock *endMBB;
11945
11946  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
11947  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
11948  unsigned OffsetReg = 0;
11949
11950  if (!UseGPOffset && !UseFPOffset) {
11951    // If we only pull from the overflow region, we don't create a branch.
11952    // We don't need to alter control flow.
11953    OffsetDestReg = 0; // unused
11954    OverflowDestReg = DestReg;
11955
11956    offsetMBB = NULL;
11957    overflowMBB = thisMBB;
11958    endMBB = thisMBB;
11959  } else {
11960    // First emit code to check if gp_offset (or fp_offset) is below the bound.
11961    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
11962    // If not, pull from overflow_area. (branch to overflowMBB)
11963    //
11964    //       thisMBB
11965    //         |     .
11966    //         |        .
11967    //     offsetMBB   overflowMBB
11968    //         |        .
11969    //         |     .
11970    //        endMBB
11971
11972    // Registers for the PHI in endMBB
11973    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
11974    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
11975
11976    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
11977    MachineFunction *MF = MBB->getParent();
11978    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11979    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11980    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11981
11982    MachineFunction::iterator MBBIter = MBB;
11983    ++MBBIter;
11984
11985    // Insert the new basic blocks
11986    MF->insert(MBBIter, offsetMBB);
11987    MF->insert(MBBIter, overflowMBB);
11988    MF->insert(MBBIter, endMBB);
11989
11990    // Transfer the remainder of MBB and its successor edges to endMBB.
11991    endMBB->splice(endMBB->begin(), thisMBB,
11992                    llvm::next(MachineBasicBlock::iterator(MI)),
11993                    thisMBB->end());
11994    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
11995
11996    // Make offsetMBB and overflowMBB successors of thisMBB
11997    thisMBB->addSuccessor(offsetMBB);
11998    thisMBB->addSuccessor(overflowMBB);
11999
12000    // endMBB is a successor of both offsetMBB and overflowMBB
12001    offsetMBB->addSuccessor(endMBB);
12002    overflowMBB->addSuccessor(endMBB);
12003
12004    // Load the offset value into a register
12005    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
12006    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
12007      .addOperand(Base)
12008      .addOperand(Scale)
12009      .addOperand(Index)
12010      .addDisp(Disp, UseFPOffset ? 4 : 0)
12011      .addOperand(Segment)
12012      .setMemRefs(MMOBegin, MMOEnd);
12013
12014    // Check if there is enough room left to pull this argument.
12015    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
12016      .addReg(OffsetReg)
12017      .addImm(MaxOffset + 8 - ArgSizeA8);
12018
12019    // Branch to "overflowMBB" if offset >= max
12020    // Fall through to "offsetMBB" otherwise
12021    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
12022      .addMBB(overflowMBB);
12023  }
12024
12025  // In offsetMBB, emit code to use the reg_save_area.
12026  if (offsetMBB) {
12027    assert(OffsetReg != 0);
12028
12029    // Read the reg_save_area address.
12030    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
12031    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
12032      .addOperand(Base)
12033      .addOperand(Scale)
12034      .addOperand(Index)
12035      .addDisp(Disp, 16)
12036      .addOperand(Segment)
12037      .setMemRefs(MMOBegin, MMOEnd);
12038
12039    // Zero-extend the offset
12040    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
12041    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
12042      .addImm(0)
12043      .addReg(OffsetReg)
12044      .addImm(X86::sub_32bit);
12045
12046    // Add the offset to the reg_save_area to get the final address.
12047    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
12048      .addReg(OffsetReg64)
12049      .addReg(RegSaveReg);
12050
12051    // Compute the offset for the next argument
12052    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
12053    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
12054      .addReg(OffsetReg)
12055      .addImm(UseFPOffset ? 16 : 8);
12056
12057    // Store it back into the va_list.
12058    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
12059      .addOperand(Base)
12060      .addOperand(Scale)
12061      .addOperand(Index)
12062      .addDisp(Disp, UseFPOffset ? 4 : 0)
12063      .addOperand(Segment)
12064      .addReg(NextOffsetReg)
12065      .setMemRefs(MMOBegin, MMOEnd);
12066
12067    // Jump to endMBB
12068    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
12069      .addMBB(endMBB);
12070  }
12071
12072  //
12073  // Emit code to use overflow area
12074  //
12075
12076  // Load the overflow_area address into a register.
12077  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
12078  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
12079    .addOperand(Base)
12080    .addOperand(Scale)
12081    .addOperand(Index)
12082    .addDisp(Disp, 8)
12083    .addOperand(Segment)
12084    .setMemRefs(MMOBegin, MMOEnd);
12085
12086  // If we need to align it, do so. Otherwise, just copy the address
12087  // to OverflowDestReg.
12088  if (NeedsAlign) {
12089    // Align the overflow address
12090    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
12091    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
12092
12093    // aligned_addr = (addr + (align-1)) & ~(align-1)
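    // e.g. with Align = 16 and addr = 0x1009:
    //   (0x1009 + 15) & ~15 = 0x1018 & ~0xf = 0x1010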
12094    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
12095      .addReg(OverflowAddrReg)
12096      .addImm(Align-1);
12097
12098    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
12099      .addReg(TmpReg)
12100      .addImm(~(uint64_t)(Align-1));
12101  } else {
12102    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
12103      .addReg(OverflowAddrReg);
12104  }
12105
12106  // Compute the next overflow address after this argument.
12107  // (the overflow address should be kept 8-byte aligned)
12108  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
12109  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
12110    .addReg(OverflowDestReg)
12111    .addImm(ArgSizeA8);
12112
12113  // Store the new overflow address.
12114  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
12115    .addOperand(Base)
12116    .addOperand(Scale)
12117    .addOperand(Index)
12118    .addDisp(Disp, 8)
12119    .addOperand(Segment)
12120    .addReg(NextAddrReg)
12121    .setMemRefs(MMOBegin, MMOEnd);
12122
12123  // If we branched, emit the PHI to the front of endMBB.
12124  if (offsetMBB) {
12125    BuildMI(*endMBB, endMBB->begin(), DL,
12126            TII->get(X86::PHI), DestReg)
12127      .addReg(OffsetDestReg).addMBB(offsetMBB)
12128      .addReg(OverflowDestReg).addMBB(overflowMBB);
12129  }
12130
12131  // Erase the pseudo instruction
12132  MI->eraseFromParent();
12133
12134  return endMBB;
12135}
12136
12137MachineBasicBlock *
12138X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
12139                                                 MachineInstr *MI,
12140                                                 MachineBasicBlock *MBB) const {
12141  // Emit code to save XMM registers to the stack. The ABI says that the
12142  // number of registers to save is given in %al, so it's theoretically
12143  // possible to do an indirect jump trick to avoid saving all of them;
12144  // however, this code takes a simpler approach and just executes all
12145  // of the stores if %al is non-zero. It's less code, and it's probably
12146  // easier on the hardware branch predictor, and stores aren't all that
12147  // expensive anyway.
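  //
  // A rough sketch of what gets emitted below (non-Win64 case; "frame" is the
  // register save area frame index, offsets are VarArgsFPOffset + 16*i):
  //       test %al, %al
  //       je   EndMBB
  //     XMMSaveMBB:
  //       movaps %xmm0, <VarArgsFPOffset + 0>(frame)
  //       ...                            ; one store per live XMM arg register
  //     EndMBB: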
12148
12149  // Create the new basic blocks. One block contains all the XMM stores,
12150  // and one block is the final destination regardless of whether any
12151  // stores were performed.
12152  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
12153  MachineFunction *F = MBB->getParent();
12154  MachineFunction::iterator MBBIter = MBB;
12155  ++MBBIter;
12156  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
12157  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
12158  F->insert(MBBIter, XMMSaveMBB);
12159  F->insert(MBBIter, EndMBB);
12160
12161  // Transfer the remainder of MBB and its successor edges to EndMBB.
12162  EndMBB->splice(EndMBB->begin(), MBB,
12163                 llvm::next(MachineBasicBlock::iterator(MI)),
12164                 MBB->end());
12165  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
12166
12167  // The original block will now fall through to the XMM save block.
12168  MBB->addSuccessor(XMMSaveMBB);
12169  // The XMMSaveMBB will fall through to the end block.
12170  XMMSaveMBB->addSuccessor(EndMBB);
12171
12172  // Now add the instructions.
12173  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12174  DebugLoc DL = MI->getDebugLoc();
12175
12176  unsigned CountReg = MI->getOperand(0).getReg();
12177  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
12178  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
12179
12180  if (!Subtarget->isTargetWin64()) {
12181    // If %al is 0, branch around the XMM save block.
12182    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
12183    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
12184    MBB->addSuccessor(EndMBB);
12185  }
12186
12187  unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
12188  // In the XMM save block, save all the XMM argument registers.
12189  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
12190    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
12191    MachineMemOperand *MMO =
12192      F->getMachineMemOperand(
12193          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
12194        MachineMemOperand::MOStore,
12195        /*Size=*/16, /*Align=*/16);
12196    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
12197      .addFrameIndex(RegSaveFrameIndex)
12198      .addImm(/*Scale=*/1)
12199      .addReg(/*IndexReg=*/0)
12200      .addImm(/*Disp=*/Offset)
12201      .addReg(/*Segment=*/0)
12202      .addReg(MI->getOperand(i).getReg())
12203      .addMemOperand(MMO);
12204  }
12205
12206  MI->eraseFromParent();   // The pseudo instruction is gone now.
12207
12208  return EndMBB;
12209}
12210
12211// The EFLAGS operand of SelectItr might be missing a kill marker
12212// because there were multiple uses of EFLAGS, and ISel didn't know
12213// which to mark. Figure out whether SelectItr should have had a
12214// kill marker, and set it if it should. Returns the correct kill
12215// marker value.
12216static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
12217                                     MachineBasicBlock* BB,
12218                                     const TargetRegisterInfo* TRI) {
12219  // Scan forward through BB for a use/def of EFLAGS.
12220  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
12221  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
12222    const MachineInstr& mi = *miI;
12223    if (mi.readsRegister(X86::EFLAGS))
12224      return false;
12225    if (mi.definesRegister(X86::EFLAGS))
12226      break; // Should have kill-flag - update below.
12227  }
12228
12229  // If we hit the end of the block, check whether EFLAGS is live into a
12230  // successor.
12231  if (miI == BB->end()) {
12232    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
12233                                          sEnd = BB->succ_end();
12234         sItr != sEnd; ++sItr) {
12235      MachineBasicBlock* succ = *sItr;
12236      if (succ->isLiveIn(X86::EFLAGS))
12237        return false;
12238    }
12239  }
12240
12241  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
12242  // out. SelectMI should have a kill flag on EFLAGS.
12243  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
12244  return true;
12245}
12246
12247MachineBasicBlock *
12248X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
12249                                     MachineBasicBlock *BB) const {
12250  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12251  DebugLoc DL = MI->getDebugLoc();
12252
12253  // To "insert" a SELECT_CC instruction, we actually have to insert the
12254  // diamond control-flow pattern.  The incoming instruction knows the
12255  // destination vreg to set, the condition code register to branch on, the
12256  // true/false values to select between, and a branch opcode to use.
12257  const BasicBlock *LLVM_BB = BB->getBasicBlock();
12258  MachineFunction::iterator It = BB;
12259  ++It;
12260
12261  //  thisMBB:
12262  //  ...
12263  //   TrueVal = ...
12264  //   cmpTY ccX, r1, r2
12265  //   bCC copy1MBB
12266  //   fallthrough --> copy0MBB
12267  MachineBasicBlock *thisMBB = BB;
12268  MachineFunction *F = BB->getParent();
12269  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12270  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12271  F->insert(It, copy0MBB);
12272  F->insert(It, sinkMBB);
12273
12274  // If the EFLAGS register isn't dead in the terminator, then claim that it's
12275  // live into the sink and copy blocks.
12276  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
12277  if (!MI->killsRegister(X86::EFLAGS) &&
12278      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
12279    copy0MBB->addLiveIn(X86::EFLAGS);
12280    sinkMBB->addLiveIn(X86::EFLAGS);
12281  }
12282
12283  // Transfer the remainder of BB and its successor edges to sinkMBB.
12284  sinkMBB->splice(sinkMBB->begin(), BB,
12285                  llvm::next(MachineBasicBlock::iterator(MI)),
12286                  BB->end());
12287  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
12288
12289  // Add the true and fallthrough blocks as its successors.
12290  BB->addSuccessor(copy0MBB);
12291  BB->addSuccessor(sinkMBB);
12292
12293  // Create the conditional branch instruction.
12294  unsigned Opc =
12295    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
12296  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
12297
12298  //  copy0MBB:
12299  //   %FalseValue = ...
12300  //   # fallthrough to sinkMBB
12301  copy0MBB->addSuccessor(sinkMBB);
12302
12303  //  sinkMBB:
12304  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12305  //  ...
12306  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
12307          TII->get(X86::PHI), MI->getOperand(0).getReg())
12308    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
12309    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
12310
12311  MI->eraseFromParent();   // The pseudo instruction is gone now.
12312  return sinkMBB;
12313}
12314
12315MachineBasicBlock *
12316X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
12317                                        bool Is64Bit) const {
12318  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12319  DebugLoc DL = MI->getDebugLoc();
12320  MachineFunction *MF = BB->getParent();
12321  const BasicBlock *LLVM_BB = BB->getBasicBlock();
12322
12323  assert(getTargetMachine().Options.EnableSegmentedStacks);
12324
12325  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
12326  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
12327
12328  // BB:
12329  //  ... [Till the alloca]
12330  // If stacklet is not large enough, jump to mallocMBB
12331  //
12332  // bumpMBB:
12333  //  Allocate by subtracting from RSP
12334  //  Jump to continueMBB
12335  //
12336  // mallocMBB:
12337  //  Allocate by call to runtime
12338  //
12339  // continueMBB:
12340  //  ...
12341  //  [rest of original BB]
12342  //
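  // Roughly (64-bit case, illustrative only), the limit check emitted into BB
  // below is:
  //     mov  tmpSP, rsp
  //     sub  SPLimit, tmpSP, size      ; candidate new stack pointer
  //     cmp  fs:[0x70], SPLimit        ; compare against the stacklet limit
  //     jg   mallocMBB                 ; limit above candidate -> grow the stack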
12343
12344  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
12345  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
12346  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
12347
12348  MachineRegisterInfo &MRI = MF->getRegInfo();
12349  const TargetRegisterClass *AddrRegClass =
12350    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
12351
12352  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
12353    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
12354    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
12355    SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
12356    sizeVReg = MI->getOperand(1).getReg(),
12357    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
12358
12359  MachineFunction::iterator MBBIter = BB;
12360  ++MBBIter;
12361
12362  MF->insert(MBBIter, bumpMBB);
12363  MF->insert(MBBIter, mallocMBB);
12364  MF->insert(MBBIter, continueMBB);
12365
12366  continueMBB->splice(continueMBB->begin(), BB, llvm::next
12367                      (MachineBasicBlock::iterator(MI)), BB->end());
12368  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
12369
12370  // Add code to the main basic block to check if the stack limit has been hit,
12371  // and if so, jump to mallocMBB; otherwise fall through to bumpMBB.
12372  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
12373  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
12374    .addReg(tmpSPVReg).addReg(sizeVReg);
12375  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
12376    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
12377    .addReg(SPLimitVReg);
12378  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
12379
12380  // bumpMBB simply decreases the stack pointer, since we know the current
12381  // stacklet has enough space.
12382  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
12383    .addReg(SPLimitVReg);
12384  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
12385    .addReg(SPLimitVReg);
12386  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
12387
12388  // mallocMBB calls into a libgcc runtime routine to allocate more space from the heap.
12389  const uint32_t *RegMask =
12390    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
12391  if (Is64Bit) {
12392    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
12393      .addReg(sizeVReg);
12394    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
12395      .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI)
12396      .addRegMask(RegMask)
12397      .addReg(X86::RAX, RegState::ImplicitDefine);
12398  } else {
12399    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
12400      .addImm(12);
12401    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
12402    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
12403      .addExternalSymbol("__morestack_allocate_stack_space")
12404      .addRegMask(RegMask)
12405      .addReg(X86::EAX, RegState::ImplicitDefine);
12406  }
12407
12408  if (!Is64Bit)
12409    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
12410      .addImm(16);
12411
12412  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
12413    .addReg(Is64Bit ? X86::RAX : X86::EAX);
12414  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
12415
12416  // Set up the CFG correctly.
12417  BB->addSuccessor(bumpMBB);
12418  BB->addSuccessor(mallocMBB);
12419  mallocMBB->addSuccessor(continueMBB);
12420  bumpMBB->addSuccessor(continueMBB);
12421
12422  // Take care of the PHI nodes.
12423  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
12424          MI->getOperand(0).getReg())
12425    .addReg(mallocPtrVReg).addMBB(mallocMBB)
12426    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
12427
12428  // Delete the original pseudo instruction.
12429  MI->eraseFromParent();
12430
12431  // And we're done.
12432  return continueMBB;
12433}
12434
12435MachineBasicBlock *
12436X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
12437                                          MachineBasicBlock *BB) const {
12438  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12439  DebugLoc DL = MI->getDebugLoc();
12440
12441  assert(!Subtarget->isTargetEnvMacho());
12442
12443  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
12444  // non-trivial part is the implicit def of ESP.
12445
12446  if (Subtarget->isTargetWin64()) {
12447    if (Subtarget->isTargetCygMing()) {
12448      // ___chkstk(Mingw64):
12449      // Clobbers R10, R11, RAX and EFLAGS.
12450      // Updates RSP.
12451      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
12452        .addExternalSymbol("___chkstk")
12453        .addReg(X86::RAX, RegState::Implicit)
12454        .addReg(X86::RSP, RegState::Implicit)
12455        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
12456        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
12457        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
12458    } else {
12459      // __chkstk(MSVCRT): does not update stack pointer.
12460      // Clobbers R10, R11 and EFLAGS.
12461      // FIXME: RAX(allocated size) might be reused and not killed.
12462      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
12463        .addExternalSymbol("__chkstk")
12464        .addReg(X86::RAX, RegState::Implicit)
12465        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
12466      // RAX holds the amount to be subtracted from RSP.
12467      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
12468        .addReg(X86::RSP)
12469        .addReg(X86::RAX);
12470    }
12471  } else {
12472    const char *StackProbeSymbol =
12473      Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
12474
12475    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
12476      .addExternalSymbol(StackProbeSymbol)
12477      .addReg(X86::EAX, RegState::Implicit)
12478      .addReg(X86::ESP, RegState::Implicit)
12479      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
12480      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
12481      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
12482  }
12483
12484  MI->eraseFromParent();   // The pseudo instruction is gone now.
12485  return BB;
12486}
12487
12488MachineBasicBlock *
12489X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
12490                                      MachineBasicBlock *BB) const {
12491  // This is pretty easy.  We're taking the value that we received from
12492  // our load from the relocation, sticking it in either RDI (x86-64)
12493  // or EAX and doing an indirect call.  The return value will then
12494  // be in the normal return register.
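  //
  // For example, on x86-64 Darwin the emitted sequence is roughly (sketch
  // only; the actual relocation comes from operand 3's target flags):
  //     movq  _var@TLVP(%rip), %rdi
  //     callq *(%rdi)
  // leaving the result in %rax.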
12495  const X86InstrInfo *TII
12496    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
12497  DebugLoc DL = MI->getDebugLoc();
12498  MachineFunction *F = BB->getParent();
12499
12500  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
12501  assert(MI->getOperand(3).isGlobal() && "This should be a global");
12502
12503  // Get a register mask for the lowered call.
12504  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
12505  // proper register mask.
12506  const uint32_t *RegMask =
12507    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
12508  if (Subtarget->is64Bit()) {
12509    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
12510                                      TII->get(X86::MOV64rm), X86::RDI)
12511    .addReg(X86::RIP)
12512    .addImm(0).addReg(0)
12513    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
12514                      MI->getOperand(3).getTargetFlags())
12515    .addReg(0);
12516    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
12517    addDirectMem(MIB, X86::RDI);
12518    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
12519  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
12520    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
12521                                      TII->get(X86::MOV32rm), X86::EAX)
12522    .addReg(0)
12523    .addImm(0).addReg(0)
12524    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
12525                      MI->getOperand(3).getTargetFlags())
12526    .addReg(0);
12527    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
12528    addDirectMem(MIB, X86::EAX);
12529    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
12530  } else {
12531    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
12532                                      TII->get(X86::MOV32rm), X86::EAX)
12533    .addReg(TII->getGlobalBaseReg(F))
12534    .addImm(0).addReg(0)
12535    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
12536                      MI->getOperand(3).getTargetFlags())
12537    .addReg(0);
12538    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
12539    addDirectMem(MIB, X86::EAX);
12540    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
12541  }
12542
12543  MI->eraseFromParent(); // The pseudo instruction is gone now.
12544  return BB;
12545}
12546
12547MachineBasicBlock *
12548X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
12549                                               MachineBasicBlock *BB) const {
12550  switch (MI->getOpcode()) {
12551  default: llvm_unreachable("Unexpected instr type to insert");
12552  case X86::TAILJMPd64:
12553  case X86::TAILJMPr64:
12554  case X86::TAILJMPm64:
12555    llvm_unreachable("TAILJMP64 should not be touched here.");
12556  case X86::TCRETURNdi64:
12557  case X86::TCRETURNri64:
12558  case X86::TCRETURNmi64:
12559    return BB;
12560  case X86::WIN_ALLOCA:
12561    return EmitLoweredWinAlloca(MI, BB);
12562  case X86::SEG_ALLOCA_32:
12563    return EmitLoweredSegAlloca(MI, BB, false);
12564  case X86::SEG_ALLOCA_64:
12565    return EmitLoweredSegAlloca(MI, BB, true);
12566  case X86::TLSCall_32:
12567  case X86::TLSCall_64:
12568    return EmitLoweredTLSCall(MI, BB);
12569  case X86::CMOV_GR8:
12570  case X86::CMOV_FR32:
12571  case X86::CMOV_FR64:
12572  case X86::CMOV_V4F32:
12573  case X86::CMOV_V2F64:
12574  case X86::CMOV_V2I64:
12575  case X86::CMOV_V8F32:
12576  case X86::CMOV_V4F64:
12577  case X86::CMOV_V4I64:
12578  case X86::CMOV_GR16:
12579  case X86::CMOV_GR32:
12580  case X86::CMOV_RFP32:
12581  case X86::CMOV_RFP64:
12582  case X86::CMOV_RFP80:
12583    return EmitLoweredSelect(MI, BB);
12584
12585  case X86::FP32_TO_INT16_IN_MEM:
12586  case X86::FP32_TO_INT32_IN_MEM:
12587  case X86::FP32_TO_INT64_IN_MEM:
12588  case X86::FP64_TO_INT16_IN_MEM:
12589  case X86::FP64_TO_INT32_IN_MEM:
12590  case X86::FP64_TO_INT64_IN_MEM:
12591  case X86::FP80_TO_INT16_IN_MEM:
12592  case X86::FP80_TO_INT32_IN_MEM:
12593  case X86::FP80_TO_INT64_IN_MEM: {
12594    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12595    DebugLoc DL = MI->getDebugLoc();
12596
12597    // Change the floating point control register to use "round towards zero"
12598    // mode when truncating to an integer value.
12599    MachineFunction *F = BB->getParent();
12600    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
12601    addFrameReference(BuildMI(*BB, MI, DL,
12602                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
12603
12604    // Load the old value of the control word...
12605    unsigned OldCW =
12606      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
12607    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
12608                      CWFrameIdx);
12609
12610    // Set the rounding mode to round-toward-zero by storing 0xC7F...
12611    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
12612      .addImm(0xC7F);
12613
12614    // Reload the modified control word now...
12615    addFrameReference(BuildMI(*BB, MI, DL,
12616                              TII->get(X86::FLDCW16m)), CWFrameIdx);
12617
12618    // Restore the in-memory image of the control word to its original value
12619    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
12620      .addReg(OldCW);
12621
12622    // Get the X86 opcode to use.
12623    unsigned Opc;
12624    switch (MI->getOpcode()) {
12625    default: llvm_unreachable("illegal opcode!");
12626    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
12627    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
12628    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
12629    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
12630    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
12631    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
12632    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
12633    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
12634    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
12635    }
12636
12637    X86AddressMode AM;
12638    MachineOperand &Op = MI->getOperand(0);
12639    if (Op.isReg()) {
12640      AM.BaseType = X86AddressMode::RegBase;
12641      AM.Base.Reg = Op.getReg();
12642    } else {
12643      AM.BaseType = X86AddressMode::FrameIndexBase;
12644      AM.Base.FrameIndex = Op.getIndex();
12645    }
12646    Op = MI->getOperand(1);
12647    if (Op.isImm())
12648      AM.Scale = Op.getImm();
12649    Op = MI->getOperand(2);
12650    if (Op.isImm())
12651      AM.IndexReg = Op.getImm();
12652    Op = MI->getOperand(3);
12653    if (Op.isGlobal()) {
12654      AM.GV = Op.getGlobal();
12655    } else {
12656      AM.Disp = Op.getImm();
12657    }
12658    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
12659                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
12660
12661    // Reload the original control word now.
12662    addFrameReference(BuildMI(*BB, MI, DL,
12663                              TII->get(X86::FLDCW16m)), CWFrameIdx);
12664
12665    MI->eraseFromParent();   // The pseudo instruction is gone now.
12666    return BB;
12667  }
12668    // String/text processing lowering.
12669  case X86::PCMPISTRM128REG:
12670  case X86::VPCMPISTRM128REG:
12671    return EmitPCMP(MI, BB, 3, /*memArg=*/false);
12672  case X86::PCMPISTRM128MEM:
12673  case X86::VPCMPISTRM128MEM:
12674    return EmitPCMP(MI, BB, 3, /*memArg=*/true);
12675  case X86::PCMPESTRM128REG:
12676  case X86::VPCMPESTRM128REG:
12677    return EmitPCMP(MI, BB, 5, /*memArg=*/false);
12678  case X86::PCMPESTRM128MEM:
12679  case X86::VPCMPESTRM128MEM:
12680    return EmitPCMP(MI, BB, 5, /*memArg=*/true);
12681
12682    // Thread synchronization.
12683  case X86::MONITOR:
12684    return EmitMonitor(MI, BB);
12685  case X86::MWAIT:
12686    return EmitMwait(MI, BB);
12687
12688    // Atomic Lowering.
12689  case X86::ATOMAND32:
12690    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
12691                                               X86::AND32ri, X86::MOV32rm,
12692                                               X86::LCMPXCHG32,
12693                                               X86::NOT32r, X86::EAX,
12694                                               &X86::GR32RegClass);
12695  case X86::ATOMOR32:
12696    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
12697                                               X86::OR32ri, X86::MOV32rm,
12698                                               X86::LCMPXCHG32,
12699                                               X86::NOT32r, X86::EAX,
12700                                               &X86::GR32RegClass);
12701  case X86::ATOMXOR32:
12702    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
12703                                               X86::XOR32ri, X86::MOV32rm,
12704                                               X86::LCMPXCHG32,
12705                                               X86::NOT32r, X86::EAX,
12706                                               &X86::GR32RegClass);
12707  case X86::ATOMNAND32:
12708    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
12709                                               X86::AND32ri, X86::MOV32rm,
12710                                               X86::LCMPXCHG32,
12711                                               X86::NOT32r, X86::EAX,
12712                                               &X86::GR32RegClass, true);
12713  case X86::ATOMMIN32:
12714    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
12715  case X86::ATOMMAX32:
12716    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
12717  case X86::ATOMUMIN32:
12718    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
12719  case X86::ATOMUMAX32:
12720    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
12721
12722  case X86::ATOMAND16:
12723    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
12724                                               X86::AND16ri, X86::MOV16rm,
12725                                               X86::LCMPXCHG16,
12726                                               X86::NOT16r, X86::AX,
12727                                               &X86::GR16RegClass);
12728  case X86::ATOMOR16:
12729    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
12730                                               X86::OR16ri, X86::MOV16rm,
12731                                               X86::LCMPXCHG16,
12732                                               X86::NOT16r, X86::AX,
12733                                               &X86::GR16RegClass);
12734  case X86::ATOMXOR16:
12735    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
12736                                               X86::XOR16ri, X86::MOV16rm,
12737                                               X86::LCMPXCHG16,
12738                                               X86::NOT16r, X86::AX,
12739                                               &X86::GR16RegClass);
12740  case X86::ATOMNAND16:
12741    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
12742                                               X86::AND16ri, X86::MOV16rm,
12743                                               X86::LCMPXCHG16,
12744                                               X86::NOT16r, X86::AX,
12745                                               &X86::GR16RegClass, true);
12746  case X86::ATOMMIN16:
12747    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
12748  case X86::ATOMMAX16:
12749    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
12750  case X86::ATOMUMIN16:
12751    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
12752  case X86::ATOMUMAX16:
12753    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
12754
12755  case X86::ATOMAND8:
12756    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
12757                                               X86::AND8ri, X86::MOV8rm,
12758                                               X86::LCMPXCHG8,
12759                                               X86::NOT8r, X86::AL,
12760                                               &X86::GR8RegClass);
12761  case X86::ATOMOR8:
12762    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
12763                                               X86::OR8ri, X86::MOV8rm,
12764                                               X86::LCMPXCHG8,
12765                                               X86::NOT8r, X86::AL,
12766                                               &X86::GR8RegClass);
12767  case X86::ATOMXOR8:
12768    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
12769                                               X86::XOR8ri, X86::MOV8rm,
12770                                               X86::LCMPXCHG8,
12771                                               X86::NOT8r, X86::AL,
12772                                               &X86::GR8RegClass);
12773  case X86::ATOMNAND8:
12774    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
12775                                               X86::AND8ri, X86::MOV8rm,
12776                                               X86::LCMPXCHG8,
12777                                               X86::NOT8r, X86::AL,
12778                                               &X86::GR8RegClass, true);
12779  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
12780  // This group is for 64-bit host.
12781  case X86::ATOMAND64:
12782    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
12783                                               X86::AND64ri32, X86::MOV64rm,
12784                                               X86::LCMPXCHG64,
12785                                               X86::NOT64r, X86::RAX,
12786                                               &X86::GR64RegClass);
12787  case X86::ATOMOR64:
12788    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
12789                                               X86::OR64ri32, X86::MOV64rm,
12790                                               X86::LCMPXCHG64,
12791                                               X86::NOT64r, X86::RAX,
12792                                               &X86::GR64RegClass);
12793  case X86::ATOMXOR64:
12794    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
12795                                               X86::XOR64ri32, X86::MOV64rm,
12796                                               X86::LCMPXCHG64,
12797                                               X86::NOT64r, X86::RAX,
12798                                               &X86::GR64RegClass);
12799  case X86::ATOMNAND64:
12800    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
12801                                               X86::AND64ri32, X86::MOV64rm,
12802                                               X86::LCMPXCHG64,
12803                                               X86::NOT64r, X86::RAX,
12804                                               &X86::GR64RegClass, true);
12805  case X86::ATOMMIN64:
12806    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
12807  case X86::ATOMMAX64:
12808    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
12809  case X86::ATOMUMIN64:
12810    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
12811  case X86::ATOMUMAX64:
12812    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
12813
12814  // This group does 64-bit operations on a 32-bit host.
12815  case X86::ATOMAND6432:
12816    return EmitAtomicBit6432WithCustomInserter(MI, BB,
12817                                               X86::AND32rr, X86::AND32rr,
12818                                               X86::AND32ri, X86::AND32ri,
12819                                               false);
12820  case X86::ATOMOR6432:
12821    return EmitAtomicBit6432WithCustomInserter(MI, BB,
12822                                               X86::OR32rr, X86::OR32rr,
12823                                               X86::OR32ri, X86::OR32ri,
12824                                               false);
12825  case X86::ATOMXOR6432:
12826    return EmitAtomicBit6432WithCustomInserter(MI, BB,
12827                                               X86::XOR32rr, X86::XOR32rr,
12828                                               X86::XOR32ri, X86::XOR32ri,
12829                                               false);
12830  case X86::ATOMNAND6432:
12831    return EmitAtomicBit6432WithCustomInserter(MI, BB,
12832                                               X86::AND32rr, X86::AND32rr,
12833                                               X86::AND32ri, X86::AND32ri,
12834                                               true);
12835  case X86::ATOMADD6432:
12836    return EmitAtomicBit6432WithCustomInserter(MI, BB,
12837                                               X86::ADD32rr, X86::ADC32rr,
12838                                               X86::ADD32ri, X86::ADC32ri,
12839                                               false);
12840  case X86::ATOMSUB6432:
12841    return EmitAtomicBit6432WithCustomInserter(MI, BB,
12842                                               X86::SUB32rr, X86::SBB32rr,
12843                                               X86::SUB32ri, X86::SBB32ri,
12844                                               false);
12845  case X86::ATOMSWAP6432:
12846    return EmitAtomicBit6432WithCustomInserter(MI, BB,
12847                                               X86::MOV32rr, X86::MOV32rr,
12848                                               X86::MOV32ri, X86::MOV32ri,
12849                                               false);
12850  case X86::VASTART_SAVE_XMM_REGS:
12851    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
12852
12853  case X86::VAARG_64:
12854    return EmitVAARG64WithCustomInserter(MI, BB);
12855  }
12856}
12857
12858//===----------------------------------------------------------------------===//
12859//                           X86 Optimization Hooks
12860//===----------------------------------------------------------------------===//
12861
12862void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
12863                                                       APInt &KnownZero,
12864                                                       APInt &KnownOne,
12865                                                       const SelectionDAG &DAG,
12866                                                       unsigned Depth) const {
12867  unsigned BitWidth = KnownZero.getBitWidth();
12868  unsigned Opc = Op.getOpcode();
12869  assert((Opc >= ISD::BUILTIN_OP_END ||
12870          Opc == ISD::INTRINSIC_WO_CHAIN ||
12871          Opc == ISD::INTRINSIC_W_CHAIN ||
12872          Opc == ISD::INTRINSIC_VOID) &&
12873         "Should use MaskedValueIsZero if you don't know whether Op"
12874         " is a target node!");
12875
12876  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
12877  switch (Opc) {
12878  default: break;
12879  case X86ISD::ADD:
12880  case X86ISD::SUB:
12881  case X86ISD::ADC:
12882  case X86ISD::SBB:
12883  case X86ISD::SMUL:
12884  case X86ISD::UMUL:
12885  case X86ISD::INC:
12886  case X86ISD::DEC:
12887  case X86ISD::OR:
12888  case X86ISD::XOR:
12889  case X86ISD::AND:
12890    // These nodes' second result is a boolean.
12891    if (Op.getResNo() == 0)
12892      break;
12893    // Fallthrough
12894  case X86ISD::SETCC:
12895    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
12896    break;
12897  case ISD::INTRINSIC_WO_CHAIN: {
12898    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
12899    unsigned NumLoBits = 0;
12900    switch (IntId) {
12901    default: break;
12902    case Intrinsic::x86_sse_movmsk_ps:
12903    case Intrinsic::x86_avx_movmsk_ps_256:
12904    case Intrinsic::x86_sse2_movmsk_pd:
12905    case Intrinsic::x86_avx_movmsk_pd_256:
12906    case Intrinsic::x86_mmx_pmovmskb:
12907    case Intrinsic::x86_sse2_pmovmskb_128:
12908    case Intrinsic::x86_avx2_pmovmskb: {
12909      // High bits of movmskp{s|d}, pmovmskb are known zero.
12910      switch (IntId) {
12911        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
12912        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
12913        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
12914        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
12915        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
12916        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
12917        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
12918        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
12919      }
12920      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
12921      break;
12922    }
12923    }
12924    break;
12925  }
12926  }
12927}
12928
12929unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
12930                                                         unsigned Depth) const {
12931  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
12932  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
12933    return Op.getValueType().getScalarType().getSizeInBits();
12934
12935  // Fallback case.
12936  return 1;
12937}
12938
12939/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
12940/// node is a GlobalAddress + offset.
12941bool X86TargetLowering::isGAPlusOffset(SDNode *N,
12942                                       const GlobalValue* &GA,
12943                                       int64_t &Offset) const {
12944  if (N->getOpcode() == X86ISD::Wrapper) {
12945    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
12946      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
12947      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
12948      return true;
12949    }
12950  }
12951  return TargetLowering::isGAPlusOffset(N, GA, Offset);
12952}
12953
12954/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
12955  /// same as extracting the high 128-bit part of a 256-bit vector and then
12956  /// inserting the result into the low part of a new 256-bit vector.
12957static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
12958  EVT VT = SVOp->getValueType(0);
12959  unsigned NumElems = VT.getVectorNumElements();
12960
12961  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12962  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
12963    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
12964        SVOp->getMaskElt(j) >= 0)
12965      return false;
12966
12967  return true;
12968}
12969
12970/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
12971  /// same as extracting the low 128-bit part of a 256-bit vector and then
12972  /// inserting the result into the high part of a new 256-bit vector.
12973static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
12974  EVT VT = SVOp->getValueType(0);
12975  unsigned NumElems = VT.getVectorNumElements();
12976
12977  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12978  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
12979    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
12980        SVOp->getMaskElt(j) >= 0)
12981      return false;
12982
12983  return true;
12984}
12985
12986/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
12987static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
12988                                        TargetLowering::DAGCombinerInfo &DCI,
12989                                        const X86Subtarget* Subtarget) {
12990  DebugLoc dl = N->getDebugLoc();
12991  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
12992  SDValue V1 = SVOp->getOperand(0);
12993  SDValue V2 = SVOp->getOperand(1);
12994  EVT VT = SVOp->getValueType(0);
12995  unsigned NumElems = VT.getVectorNumElements();
12996
12997  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
12998      V2.getOpcode() == ISD::CONCAT_VECTORS) {
12999    //
13000    //                   0,0,0,...
13001    //                      |
13002    //    V      UNDEF    BUILD_VECTOR    UNDEF
13003    //     \      /           \           /
13004    //  CONCAT_VECTOR         CONCAT_VECTOR
13005    //         \                  /
13006    //          \                /
13007    //          RESULT: V + zero extended
13008    //
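    // Concretely, for v8i32 this matches a mask such as
    //   shuffle (concat V, undef), (concat <0,0,0,0>, undef),
    //           <0, 1, 2, 3, 8, 8, 8, 8>
    // where the low half takes V's elements in order and the high half splats
    // element NumElems (the first zero of the second concat).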
13009    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
13010        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
13011        V1.getOperand(1).getOpcode() != ISD::UNDEF)
13012      return SDValue();
13013
13014    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
13015      return SDValue();
13016
13017    // To match the shuffle mask, the first half of the mask should
13018    // be exactly the first vector, and all the rest a splat with the
13019    // first element of the second one.
13020    for (unsigned i = 0; i != NumElems/2; ++i)
13021      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
13022          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
13023        return SDValue();
13024
13025    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
13026    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
13027      SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
13028      SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
13029      SDValue ResNode =
13030        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
13031                                Ld->getMemoryVT(),
13032                                Ld->getPointerInfo(),
13033                                Ld->getAlignment(),
13034                                false/*isVolatile*/, true/*ReadMem*/,
13035                                false/*WriteMem*/);
13036      return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
13037    }
13038
13039    // Emit a zeroed vector and insert the desired subvector into its
13040    // first half.
13041    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
13042    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
13043    return DCI.CombineTo(N, InsV);
13044  }
13045
13046  //===--------------------------------------------------------------------===//
13047  // Combine some shuffles into subvector extracts and inserts:
13048  //
13049
13050  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
13051  if (isShuffleHigh128VectorInsertLow(SVOp)) {
13052    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
13053    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
13054    return DCI.CombineTo(N, InsV);
13055  }
13056
13057  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
13058  if (isShuffleLow128VectorInsertHigh(SVOp)) {
13059    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
13060    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
13061    return DCI.CombineTo(N, InsV);
13062  }
13063
13064  return SDValue();
13065}
13066
13067/// PerformShuffleCombine - Performs several different shuffle combines.
13068static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
13069                                     TargetLowering::DAGCombinerInfo &DCI,
13070                                     const X86Subtarget *Subtarget) {
13071  DebugLoc dl = N->getDebugLoc();
13072  EVT VT = N->getValueType(0);
13073
13074  // Don't create instructions with illegal types after legalize types has run.
13075  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13076  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
13077    return SDValue();
13078
13079  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
13080  if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
13081      N->getOpcode() == ISD::VECTOR_SHUFFLE)
13082    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
13083
13084  // Only handle 128-bit wide vectors from here on.
13085  if (VT.getSizeInBits() != 128)
13086    return SDValue();
13087
13088  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
13089  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
13090  // consecutive, non-overlapping, and in the right order.
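  // For example, with v4i32 this turns a shuffle of
  //   (build_vector (load p), (load p+4), (load p+8), (load p+12))
  // with mask <0, 1, 2, 3> into a single 128-bit load from p.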
13091  SmallVector<SDValue, 16> Elts;
13092  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
13093    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
13094
13095  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
13096}
13097
13098
13099/// PerformTruncateCombine - Converts a truncate operation into a sequence
13100/// of vector shuffle operations.
13101/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
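/// For example, a v4i64 -> v4i32 truncate becomes a single cross-lane shuffle
/// (VPERMD) on AVX2, or a split into two 128-bit halves plus shuffles on AVX.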
13102
13103SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
13104                                                  DAGCombinerInfo &DCI) const {
13105  if (!DCI.isBeforeLegalizeOps())
13106    return SDValue();
13107
13108  if (!Subtarget->hasAVX())
13109    return SDValue();
13110
13111  EVT VT = N->getValueType(0);
13112  SDValue Op = N->getOperand(0);
13113  EVT OpVT = Op.getValueType();
13114  DebugLoc dl = N->getDebugLoc();
13115
13116  if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
13117
13118    if (Subtarget->hasAVX2()) {
13119      // AVX2: v4i64 -> v4i32
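      // Bitcast to v8i32, use one cross-lane VPERMD-style shuffle to gather
      // the even 32-bit elements (the low half of each i64), and then keep
      // only the low 128 bits of the result.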
13120
13121      // VPERMD
13122      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
13123
13124      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
13125      Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
13126                                ShufMask);
13127
13128      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
13129                         DAG.getIntPtrConstant(0));
13130    }
13131
13132    // AVX: v4i64 -> v4i32
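    // Without AVX2 cross-lane shuffles, split the value into two 128-bit
    // halves, use PSHUFD to move the low 32 bits of each i64 to the front of
    // each half, and then merge the halves with a MOVLHPS-style shuffle.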
13133    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
13134                               DAG.getIntPtrConstant(0));
13135
13136    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
13137                               DAG.getIntPtrConstant(2));
13138
13139    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
13140    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
13141
13142    // PSHUFD
13143    static const int ShufMask1[] = {0, 2, 0, 0};
13144
13145    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), ShufMask1);
13146    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), ShufMask1);
13147
13148    // MOVLHPS
13149    static const int ShufMask2[] = {0, 1, 4, 5};
13150
13151    return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
13152  }
13153
13154  if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
13155
13156    if (Subtarget->hasAVX2()) {
13157      // AVX2: v8i32 -> v8i16
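      // Use PSHUFB within each 128-bit lane to pack the low 16 bits of every
      // 32-bit element into the first 8 bytes of the lane (0x80 zeroes the
      // rest), then shuffle the two packed 64-bit chunks together and keep
      // the low 128 bits.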
13158
13159      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
13160
13161      // PSHUFB
13162      SmallVector<SDValue,32> pshufbMask;
13163      for (unsigned i = 0; i < 2; ++i) {
13164        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
13165        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
13166        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
13167        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
13168        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
13169        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
13170        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
13171        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
13172        for (unsigned j = 0; j < 8; ++j)
13173          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
13174      }
13175      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
13176                               &pshufbMask[0], 32);
13177      Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
13178
13179      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
13180
13181      static const int ShufMask[] = {0,  2,  -1,  -1};
13182      Op = DAG.getVectorShuffle(MVT::v4i64, dl,  Op, DAG.getUNDEF(MVT::v4i64),
13183                                &ShufMask[0]);
13184
13185      Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
13186                       DAG.getIntPtrConstant(0));
13187
13188      return DAG.getNode(ISD::BITCAST, dl, VT, Op);
13189    }
13190
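    // AVX without AVX2: v8i32 -> v8i16. Split into two v4i32 halves, use
    // PSHUFB to pack the low 16 bits of each element into the first 8 bytes
    // of each half, and then merge the halves with a MOVLHPS-style shuffle.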
13191    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
13192                               DAG.getIntPtrConstant(0));
13193
13194    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
13195                               DAG.getIntPtrConstant(4));
13196
13197    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
13198    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
13199
13200    // PSHUFB
13201    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
13202                                   -1, -1, -1, -1, -1, -1, -1, -1};
13203
13204    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, DAG.getUNDEF(MVT::v16i8),
13205                                ShufMask1);
13206    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, DAG.getUNDEF(MVT::v16i8),
13207                                ShufMask1);
13208
13209    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
13210    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
13211
13212    // MOVLHPS
13213    static const int ShufMask2[] = {0, 1, 4, 5};
13214
13215    SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
13216    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
13217  }
13218
13219  return SDValue();
13220}
13221
13222/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
13223/// specific shuffle of a load can be folded into a single element load.
13224/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
13225/// shuffles have been custom lowered so we need to handle those here.
13226static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
13227                                         TargetLowering::DAGCombinerInfo &DCI) {
13228  if (DCI.isBeforeLegalizeOps())
13229    return SDValue();
13230
13231  SDValue InVec = N->getOperand(0);
13232  SDValue EltNo = N->getOperand(1);
13233
13234  if (!isa<ConstantSDNode>(EltNo))
13235    return SDValue();
13236
13237  EVT VT = InVec.getValueType();
13238
13239  bool HasShuffleIntoBitcast = false;
13240  if (InVec.getOpcode() == ISD::BITCAST) {
13241    // Don't duplicate a load with other uses.
13242    if (!InVec.hasOneUse())
13243      return SDValue();
13244    EVT BCVT = InVec.getOperand(0).getValueType();
13245    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
13246      return SDValue();
13247    InVec = InVec.getOperand(0);
13248    HasShuffleIntoBitcast = true;
13249  }
13250
13251  if (!isTargetShuffle(InVec.getOpcode()))
13252    return SDValue();
13253
13254  // Don't duplicate a load with other uses.
13255  if (!InVec.hasOneUse())
13256    return SDValue();
13257
13258  SmallVector<int, 16> ShuffleMask;
13259  bool UnaryShuffle;
13260  if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle))
13261    return SDValue();
13262
13263  // Select the input vector, guarding against an out-of-range extract index.
13264  unsigned NumElems = VT.getVectorNumElements();
13265  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
13266  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
13267  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
13268                                         : InVec.getOperand(1);
13269
13270  // If the inputs to the shuffle are the same for both operands, then allow 2 uses.
13271  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
13272
13273  if (LdNode.getOpcode() == ISD::BITCAST) {
13274    // Don't duplicate a load with other uses.
13275    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
13276      return SDValue();
13277
13278    AllowedUses = 1; // only allow 1 load use if we have a bitcast
13279    LdNode = LdNode.getOperand(0);
13280  }
13281
13282  if (!ISD::isNormalLoad(LdNode.getNode()))
13283    return SDValue();
13284
13285  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
13286
13287  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
13288    return SDValue();
13289
13290  if (HasShuffleIntoBitcast) {
13291    // If there's a bitcast before the shuffle, check if the load type and
13292    // alignment are valid.
13293    unsigned Align = LN0->getAlignment();
13294    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13295    unsigned NewAlign = TLI.getTargetData()->
13296      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
13297
13298    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
13299      return SDValue();
13300  }
13301
13302  // All checks match so transform back to vector_shuffle so that DAG combiner
13303  // can finish the job
13304  DebugLoc dl = N->getDebugLoc();
13305
13306  // Create the shuffle node, taking into account the case that it's a unary shuffle.
13307  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
13308  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
13309                                 InVec.getOperand(0), Shuffle,
13310                                 &ShuffleMask[0]);
13311  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
13312  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
13313                     EltNo);
13314}
13315
13316/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
13317/// generation and convert it from being a bunch of shuffles and extracts
13318/// to a simple store and scalar loads to extract the elements.
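/// For example, four extracts (each feeding a sign- or zero-extend) that
/// together use every lane of a v4i32 value are replaced by one store of the
/// vector to a stack slot plus four scalar i32 loads.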
13319static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
13320                                         TargetLowering::DAGCombinerInfo &DCI) {
13321  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
13322  if (NewOp.getNode())
13323    return NewOp;
13324
13325  SDValue InputVector = N->getOperand(0);
13326
13327  // Only operate on vectors of 4 elements, where the alternative shuffling
13328  // gets to be more expensive.
13329  if (InputVector.getValueType() != MVT::v4i32)
13330    return SDValue();
13331
13332  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
13333  // single use which is a sign-extend or zero-extend, and all elements are
13334  // used.
13335  SmallVector<SDNode *, 4> Uses;
13336  unsigned ExtractedElements = 0;
13337  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
13338       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
13339    if (UI.getUse().getResNo() != InputVector.getResNo())
13340      return SDValue();
13341
13342    SDNode *Extract = *UI;
13343    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13344      return SDValue();
13345
13346    if (Extract->getValueType(0) != MVT::i32)
13347      return SDValue();
13348    if (!Extract->hasOneUse())
13349      return SDValue();
13350    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
13351        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
13352      return SDValue();
13353    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
13354      return SDValue();
13355
13356    // Record which element was extracted.
13357    ExtractedElements |=
13358      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
13359
13360    Uses.push_back(Extract);
13361  }
13362
13363  // If not all the elements were used, this may not be worthwhile.
13364  if (ExtractedElements != 15)
13365    return SDValue();
13366
13367  // Ok, we've now decided to do the transformation.
13368  DebugLoc dl = InputVector.getDebugLoc();
13369
13370  // Store the value to a temporary stack slot.
13371  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
13372  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
13373                            MachinePointerInfo(), false, false, 0);
13374
13375  // Replace each use (extract) with a load of the appropriate element.
13376  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
13377       UE = Uses.end(); UI != UE; ++UI) {
13378    SDNode *Extract = *UI;
13379
13380    // Compute the element's address.
13381    SDValue Idx = Extract->getOperand(1);
13382    unsigned EltSize =
13383        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
13384    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
13385    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13386    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
13387
13388    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
13389                                     StackPtr, OffsetVal);
13390
13391    // Load the scalar.
13392    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
13393                                     ScalarAddr, MachinePointerInfo(),
13394                                     false, false, false, 0);
13395
13396    // Replace the extract with the load.
13397    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
13398  }
13399
13400  // The replacement was made in place; don't return anything.
13401  return SDValue();
13402}
13403
13404/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
13405/// nodes.
13406static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
13407                                    TargetLowering::DAGCombinerInfo &DCI,
13408                                    const X86Subtarget *Subtarget) {
13409
13410
13411  DebugLoc DL = N->getDebugLoc();
13412  SDValue Cond = N->getOperand(0);
13413  // Get the LHS/RHS of the select.
13414  SDValue LHS = N->getOperand(1);
13415  SDValue RHS = N->getOperand(2);
13416  EVT VT = LHS.getValueType();
13417
13418  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
13419  // instructions match the semantics of the common C idiom x<y?x:y but not
13420  // x<=y?x:y, because of how they handle negative zero (which can be
13421  // ignored in unsafe-math mode).
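  // For example, (select (setcc olt x, y), x, y) maps directly to FMIN, while
  // unordered forms such as SETULT may need the operands swapped (or the
  // transform skipped) to keep NaN and signed-zero behaviour correct.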
13422  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
13423      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
13424      (Subtarget->hasSSE2() ||
13425       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
13426    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
13427
13428    unsigned Opcode = 0;
13429    // Check for x CC y ? x : y.
13430    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
13431        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
13432      switch (CC) {
13433      default: break;
13434      case ISD::SETULT:
13435        // Converting this to a min would handle NaNs incorrectly, and swapping
13436        // the operands would cause it to handle comparisons between positive
13437        // and negative zero incorrectly.
13438        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
13439          if (!DAG.getTarget().Options.UnsafeFPMath &&
13440              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
13441            break;
13442          std::swap(LHS, RHS);
13443        }
13444        Opcode = X86ISD::FMIN;
13445        break;
13446      case ISD::SETOLE:
13447        // Converting this to a min would handle comparisons between positive
13448        // and negative zero incorrectly.
13449        if (!DAG.getTarget().Options.UnsafeFPMath &&
13450            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
13451          break;
13452        Opcode = X86ISD::FMIN;
13453        break;
13454      case ISD::SETULE:
13455        // Converting this to a min would handle both negative zeros and NaNs
13456        // incorrectly, but we can swap the operands to fix both.
13457        std::swap(LHS, RHS);
13458      case ISD::SETOLT:
13459      case ISD::SETLT:
13460      case ISD::SETLE:
13461        Opcode = X86ISD::FMIN;
13462        break;
13463
13464      case ISD::SETOGE:
13465        // Converting this to a max would handle comparisons between positive
13466        // and negative zero incorrectly.
13467        if (!DAG.getTarget().Options.UnsafeFPMath &&
13468            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
13469          break;
13470        Opcode = X86ISD::FMAX;
13471        break;
13472      case ISD::SETUGT:
13473        // Converting this to a max would handle NaNs incorrectly, and swapping
13474        // the operands would cause it to handle comparisons between positive
13475        // and negative zero incorrectly.
13476        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
13477          if (!DAG.getTarget().Options.UnsafeFPMath &&
13478              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
13479            break;
13480          std::swap(LHS, RHS);
13481        }
13482        Opcode = X86ISD::FMAX;
13483        break;
13484      case ISD::SETUGE:
13485        // Converting this to a max would handle both negative zeros and NaNs
13486        // incorrectly, but we can swap the operands to fix both.
13487        std::swap(LHS, RHS);
13488      case ISD::SETOGT:
13489      case ISD::SETGT:
13490      case ISD::SETGE:
13491        Opcode = X86ISD::FMAX;
13492        break;
13493      }
13494    // Check for x CC y ? y : x -- a min/max with reversed arms.
13495    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
13496               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
13497      switch (CC) {
13498      default: break;
13499      case ISD::SETOGE:
13500        // Converting this to a min would handle comparisons between positive
13501        // and negative zero incorrectly, and swapping the operands would
13502        // cause it to handle NaNs incorrectly.
13503        if (!DAG.getTarget().Options.UnsafeFPMath &&
13504            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
13505          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
13506            break;
13507          std::swap(LHS, RHS);
13508        }
13509        Opcode = X86ISD::FMIN;
13510        break;
13511      case ISD::SETUGT:
13512        // Converting this to a min would handle NaNs incorrectly.
13513        if (!DAG.getTarget().Options.UnsafeFPMath &&
13514            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
13515          break;
13516        Opcode = X86ISD::FMIN;
13517        break;
13518      case ISD::SETUGE:
13519        // Converting this to a min would handle both negative zeros and NaNs
13520        // incorrectly, but we can swap the operands to fix both.
13521        std::swap(LHS, RHS);
13522      case ISD::SETOGT:
13523      case ISD::SETGT:
13524      case ISD::SETGE:
13525        Opcode = X86ISD::FMIN;
13526        break;
13527
13528      case ISD::SETULT:
13529        // Converting this to a max would handle NaNs incorrectly.
13530        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
13531          break;
13532        Opcode = X86ISD::FMAX;
13533        break;
13534      case ISD::SETOLE:
13535        // Converting this to a max would handle comparisons between positive
13536        // and negative zero incorrectly, and swapping the operands would
13537        // cause it to handle NaNs incorrectly.
13538        if (!DAG.getTarget().Options.UnsafeFPMath &&
13539            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
13540          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
13541            break;
13542          std::swap(LHS, RHS);
13543        }
13544        Opcode = X86ISD::FMAX;
13545        break;
13546      case ISD::SETULE:
13547        // Converting this to a max would handle both negative zeros and NaNs
13548        // incorrectly, but we can swap the operands to fix both.
13549        std::swap(LHS, RHS);
13550      case ISD::SETOLT:
13551      case ISD::SETLT:
13552      case ISD::SETLE:
13553        Opcode = X86ISD::FMAX;
13554        break;
13555      }
13556    }
13557
13558    if (Opcode)
13559      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
13560  }
13561
13562  // If this is a select between two integer constants, try to do some
13563  // optimizations.
13564  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
13565    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
13566      // Don't do this for crazy integer types.
13567      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
13568        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
13569        // so that TrueC (the true value) is larger than FalseC.
13570        bool NeedsCondInvert = false;
13571
13572        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
13573            // Efficiently invertible.
13574            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
13575             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
13576              isa<ConstantSDNode>(Cond.getOperand(1))))) {
13577          NeedsCondInvert = true;
13578          std::swap(TrueC, FalseC);
13579        }
13580
13581        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
13582        if (FalseC->getAPIntValue() == 0 &&
13583            TrueC->getAPIntValue().isPowerOf2()) {
13584          if (NeedsCondInvert) // Invert the condition if needed.
13585            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
13586                               DAG.getConstant(1, Cond.getValueType()));
13587
13588          // Zero extend the condition if needed.
13589          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
13590
13591          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
13592          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
13593                             DAG.getConstant(ShAmt, MVT::i8));
13594        }
13595
13596        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
13597        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
13598          if (NeedsCondInvert) // Invert the condition if needed.
13599            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
13600                               DAG.getConstant(1, Cond.getValueType()));
13601
13602          // Zero extend the condition if needed.
13603          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
13604                             FalseC->getValueType(0), Cond);
13605          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
13606                             SDValue(FalseC, 0));
13607        }
13608
13609        // Optimize cases that will turn into an LEA instruction.  This requires
13610        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
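        // For example, select C, 37, 32 has Diff == 5, so it becomes zext(C)
        // scaled by 5 plus the base 32, i.e. something like
        // lea 32(cond, cond*4) once the condition is in a register.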
13611        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
13612          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
13613          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
13614
13615          bool isFastMultiplier = false;
13616          if (Diff < 10) {
13617            switch ((unsigned char)Diff) {
13618              default: break;
13619              case 1:  // result = add base, cond
13620              case 2:  // result = lea base(    , cond*2)
13621              case 3:  // result = lea base(cond, cond*2)
13622              case 4:  // result = lea base(    , cond*4)
13623              case 5:  // result = lea base(cond, cond*4)
13624              case 8:  // result = lea base(    , cond*8)
13625              case 9:  // result = lea base(cond, cond*8)
13626                isFastMultiplier = true;
13627                break;
13628            }
13629          }
13630
13631          if (isFastMultiplier) {
13632            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
13633            if (NeedsCondInvert) // Invert the condition if needed.
13634              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
13635                                 DAG.getConstant(1, Cond.getValueType()));
13636
13637            // Zero extend the condition if needed.
13638            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
13639                               Cond);
13640            // Scale the condition by the difference.
13641            if (Diff != 1)
13642              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
13643                                 DAG.getConstant(Diff, Cond.getValueType()));
13644
13645            // Add the base if non-zero.
13646            if (FalseC->getAPIntValue() != 0)
13647              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
13648                                 SDValue(FalseC, 0));
13649            return Cond;
13650          }
13651        }
13652      }
13653  }
13654
13655  // Canonicalize max and min:
13656  // (x > y) ? x : y -> (x >= y) ? x : y
13657  // (x < y) ? x : y -> (x <= y) ? x : y
13658  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
13659  // the need for an extra compare
13660  // against zero. e.g.
13661  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
13662  // subl   %esi, %edi
13663  // testl  %edi, %edi
13664  // movl   $0, %eax
13665  // cmovgl %edi, %eax
13666  // =>
13667  // xorl   %eax, %eax
13668  // subl   %esi, %edi
13669  // cmovsl %eax, %edi
13670  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
13671      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
13672      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
13673    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
13674    switch (CC) {
13675    default: break;
13676    case ISD::SETLT:
13677    case ISD::SETGT: {
13678      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
13679      Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
13680                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
13681      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
13682    }
13683    }
13684  }
13685
13686  // If we know that this node is legal then we know that it is going to be
13687  // matched by one of the SSE/AVX BLEND instructions. These instructions only
13688  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
13689  // to simplify previous instructions.
13690  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13691  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
13692      !DCI.isBeforeLegalize() &&
13693      TLI.isOperationLegal(ISD::VSELECT, VT)) {
13694    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
13695    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
13696    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
13697
13698    APInt KnownZero, KnownOne;
13699    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
13700                                          DCI.isBeforeLegalizeOps());
13701    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
13702        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
13703      DCI.CommitTargetLoweringOpt(TLO);
13704  }
13705
13706  return SDValue();
13707}
13708
13709/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
13710static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
13711                                  TargetLowering::DAGCombinerInfo &DCI) {
13712  DebugLoc DL = N->getDebugLoc();
13713
13714  // If the flag operand isn't dead, don't touch this CMOV.
13715  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
13716    return SDValue();
13717
13718  SDValue FalseOp = N->getOperand(0);
13719  SDValue TrueOp = N->getOperand(1);
13720  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
13721  SDValue Cond = N->getOperand(3);
13722  if (CC == X86::COND_E || CC == X86::COND_NE) {
13723    switch (Cond.getOpcode()) {
13724    default: break;
13725    case X86ISD::BSR:
13726    case X86ISD::BSF:
13727      // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
13728      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
13729        return (CC == X86::COND_E) ? FalseOp : TrueOp;
13730    }
13731  }
13732
13733  // If this is a select between two integer constants, try to do some
13734  // optimizations.  Note that the operands are ordered the opposite of SELECT
13735  // operands.
13736  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
13737    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
13738      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
13739      // larger than FalseC (the false value).
13740      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
13741        CC = X86::GetOppositeBranchCondition(CC);
13742        std::swap(TrueC, FalseC);
13743      }
13744
13745      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
13746      // This is efficient for any integer data type (including i8/i16) and
13747      // shift amount.
13748      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
13749        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
13750                           DAG.getConstant(CC, MVT::i8), Cond);
13751
13752        // Zero extend the condition if needed.
13753        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
13754
13755        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
13756        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
13757                           DAG.getConstant(ShAmt, MVT::i8));
13758        if (N->getNumValues() == 2)  // Dead flag value?
13759          return DCI.CombineTo(N, Cond, SDValue());
13760        return Cond;
13761      }
13762
13763      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
13764      // for any integer data type, including i8/i16.
13765      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
13766        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
13767                           DAG.getConstant(CC, MVT::i8), Cond);
13768
13769        // Zero extend the condition if needed.
13770        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
13771                           FalseC->getValueType(0), Cond);
13772        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
13773                           SDValue(FalseC, 0));
13774
13775        if (N->getNumValues() == 2)  // Dead flag value?
13776          return DCI.CombineTo(N, Cond, SDValue());
13777        return Cond;
13778      }
13779
13780      // Optimize cases that will turn into an LEA instruction.  This requires
13781      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
13782      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
13783        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
13784        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
13785
13786        bool isFastMultiplier = false;
13787        if (Diff < 10) {
13788          switch ((unsigned char)Diff) {
13789          default: break;
13790          case 1:  // result = add base, cond
13791          case 2:  // result = lea base(    , cond*2)
13792          case 3:  // result = lea base(cond, cond*2)
13793          case 4:  // result = lea base(    , cond*4)
13794          case 5:  // result = lea base(cond, cond*4)
13795          case 8:  // result = lea base(    , cond*8)
13796          case 9:  // result = lea base(cond, cond*8)
13797            isFastMultiplier = true;
13798            break;
13799          }
13800        }
13801
13802        if (isFastMultiplier) {
13803          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
13804          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
13805                             DAG.getConstant(CC, MVT::i8), Cond);
13806          // Zero extend the condition if needed.
13807          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
13808                             Cond);
13809          // Scale the condition by the difference.
13810          if (Diff != 1)
13811            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
13812                               DAG.getConstant(Diff, Cond.getValueType()));
13813
13814          // Add the base if non-zero.
13815          if (FalseC->getAPIntValue() != 0)
13816            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
13817                               SDValue(FalseC, 0));
13818          if (N->getNumValues() == 2)  // Dead flag value?
13819            return DCI.CombineTo(N, Cond, SDValue());
13820          return Cond;
13821        }
13822      }
13823    }
13824  }
13825  return SDValue();
13826}
13827
13828
13829/// PerformMulCombine - Optimize a single multiply by a constant into two
13830/// multiplies in order to implement it with two cheaper instructions, e.g.
13831/// LEA + SHL, LEA + LEA.
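/// For example, a multiply by 45 can be rewritten as (x * 9) * 5, which can
/// then be selected as two LEA instructions.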
13832static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
13833                                 TargetLowering::DAGCombinerInfo &DCI) {
13834  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
13835    return SDValue();
13836
13837  EVT VT = N->getValueType(0);
13838  if (VT != MVT::i64)
13839    return SDValue();
13840
13841  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
13842  if (!C)
13843    return SDValue();
13844  uint64_t MulAmt = C->getZExtValue();
13845  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
13846    return SDValue();
13847
13848  uint64_t MulAmt1 = 0;
13849  uint64_t MulAmt2 = 0;
13850  if ((MulAmt % 9) == 0) {
13851    MulAmt1 = 9;
13852    MulAmt2 = MulAmt / 9;
13853  } else if ((MulAmt % 5) == 0) {
13854    MulAmt1 = 5;
13855    MulAmt2 = MulAmt / 5;
13856  } else if ((MulAmt % 3) == 0) {
13857    MulAmt1 = 3;
13858    MulAmt2 = MulAmt / 3;
13859  }
13860  if (MulAmt2 &&
13861      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
13862    DebugLoc DL = N->getDebugLoc();
13863
13864    if (isPowerOf2_64(MulAmt2) &&
13865        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
13866      // If the second multiplier is a power of 2, issue it first. We want the
13867      // multiply by 3, 5, or 9 to be folded into the addressing mode unless the
13868      // lone use is an add.
13869      std::swap(MulAmt1, MulAmt2);
13870
13871    SDValue NewMul;
13872    if (isPowerOf2_64(MulAmt1))
13873      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
13874                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
13875    else
13876      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
13877                           DAG.getConstant(MulAmt1, VT));
13878
13879    if (isPowerOf2_64(MulAmt2))
13880      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
13881                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
13882    else
13883      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
13884                           DAG.getConstant(MulAmt2, VT));
13885
13886    // Do not add new nodes to DAG combiner worklist.
13887    DCI.CombineTo(N, NewMul, false);
13888  }
13889  return SDValue();
13890}
13891
13892static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
13893  SDValue N0 = N->getOperand(0);
13894  SDValue N1 = N->getOperand(1);
13895  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
13896  EVT VT = N0.getValueType();
13897
13898  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
13899  // since the result of setcc_c is all zeros or all ones.
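  // For example, (shl (and (setcc_carry), 1), 3) becomes
  // (and (setcc_carry), 8), since shifting the mask is equivalent here.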
13900  if (VT.isInteger() && !VT.isVector() &&
13901      N1C && N0.getOpcode() == ISD::AND &&
13902      N0.getOperand(1).getOpcode() == ISD::Constant) {
13903    SDValue N00 = N0.getOperand(0);
13904    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
13905        ((N00.getOpcode() == ISD::ANY_EXTEND ||
13906          N00.getOpcode() == ISD::ZERO_EXTEND) &&
13907         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
13908      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
13909      APInt ShAmt = N1C->getAPIntValue();
13910      Mask = Mask.shl(ShAmt);
13911      if (Mask != 0)
13912        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
13913                           N00, DAG.getConstant(Mask, VT));
13914    }
13915  }
13916
13917
13918  // Hardware support for vector shifts is sparse, which makes us scalarize the
13919  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
13920  // shl.
13921  // (shl V, 1) -> add V,V
13922  if (isSplatVector(N1.getNode())) {
13923    assert(N0.getValueType().isVector() && "Invalid vector shift type");
13924    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
13925    // We shift all of the values by one. In many cases we do not have
13926    // hardware support for this operation. This is better expressed as an ADD
13927    // of two values.
13928    if (N1C && (1 == N1C->getZExtValue())) {
13929      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
13930    }
13931  }
13932
13933  return SDValue();
13934}
13935
13936/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
13937///                       when possible.
13938static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
13939                                   TargetLowering::DAGCombinerInfo &DCI,
13940                                   const X86Subtarget *Subtarget) {
13941  EVT VT = N->getValueType(0);
13942  if (N->getOpcode() == ISD::SHL) {
13943    SDValue V = PerformSHLCombine(N, DAG);
13944    if (V.getNode()) return V;
13945  }
13946
13947  // On X86 with SSE2 support, we can transform this to a vector shift if
13948  // all elements are shifted by the same amount.  We can't do this in legalize
13949  // because a constant vector is typically transformed to a constant pool
13950  // so we have no knowledge of the shift amount.
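  // For example, (shl v4i32:x, (build_vector 5, 5, 5, 5)) can be matched as a
  // single VSHLI of x by 5, since every element is shifted by the same amount.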
13951  if (!Subtarget->hasSSE2())
13952    return SDValue();
13953
13954  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
13955      (!Subtarget->hasAVX2() ||
13956       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
13957    return SDValue();
13958
13959  SDValue ShAmtOp = N->getOperand(1);
13960  EVT EltVT = VT.getVectorElementType();
13961  DebugLoc DL = N->getDebugLoc();
13962  SDValue BaseShAmt = SDValue();
13963  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
13964    unsigned NumElts = VT.getVectorNumElements();
13965    unsigned i = 0;
13966    for (; i != NumElts; ++i) {
13967      SDValue Arg = ShAmtOp.getOperand(i);
13968      if (Arg.getOpcode() == ISD::UNDEF) continue;
13969      BaseShAmt = Arg;
13970      break;
13971    }
13972    // Handle the case where the build_vector is all undef
13973    // FIXME: Should DAG allow this?
13974    if (i == NumElts)
13975      return SDValue();
13976
13977    for (; i != NumElts; ++i) {
13978      SDValue Arg = ShAmtOp.getOperand(i);
13979      if (Arg.getOpcode() == ISD::UNDEF) continue;
13980      if (Arg != BaseShAmt) {
13981        return SDValue();
13982      }
13983    }
13984  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
13985             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
13986    SDValue InVec = ShAmtOp.getOperand(0);
13987    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
13988      unsigned NumElts = InVec.getValueType().getVectorNumElements();
13989      unsigned i = 0;
13990      for (; i != NumElts; ++i) {
13991        SDValue Arg = InVec.getOperand(i);
13992        if (Arg.getOpcode() == ISD::UNDEF) continue;
13993        BaseShAmt = Arg;
13994        break;
13995      }
13996    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
13997       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
13998         unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
13999         if (C->getZExtValue() == SplatIdx)
14000           BaseShAmt = InVec.getOperand(1);
14001       }
14002    }
14003    if (BaseShAmt.getNode() == 0) {
14004      // Don't create instructions with illegal types after legalize
14005      // types has run.
14006      if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) &&
14007          !DCI.isBeforeLegalize())
14008        return SDValue();
14009
14010      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
14011                              DAG.getIntPtrConstant(0));
14012    }
14013  } else
14014    return SDValue();
14015
14016  // The shift amount is an i32.
14017  if (EltVT.bitsGT(MVT::i32))
14018    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
14019  else if (EltVT.bitsLT(MVT::i32))
14020    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
14021
14022  // The shift amount is identical so we can do a vector shift.
14023  SDValue  ValOp = N->getOperand(0);
14024  switch (N->getOpcode()) {
14025  default:
14026    llvm_unreachable("Unknown shift opcode!");
14027  case ISD::SHL:
14028    switch (VT.getSimpleVT().SimpleTy) {
14029    default: return SDValue();
14030    case MVT::v2i64:
14031    case MVT::v4i32:
14032    case MVT::v8i16:
14033    case MVT::v4i64:
14034    case MVT::v8i32:
14035    case MVT::v16i16:
14036      return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG);
14037    }
14038  case ISD::SRA:
14039    switch (VT.getSimpleVT().SimpleTy) {
14040    default: return SDValue();
14041    case MVT::v4i32:
14042    case MVT::v8i16:
14043    case MVT::v8i32:
14044    case MVT::v16i16:
14045      return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG);
14046    }
14047  case ISD::SRL:
14048    switch (VT.getSimpleVT().SimpleTy) {
14049    default: return SDValue();
14050    case MVT::v2i64:
14051    case MVT::v4i32:
14052    case MVT::v8i16:
14053    case MVT::v4i64:
14054    case MVT::v8i32:
14055    case MVT::v16i16:
14056      return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG);
14057    }
14058  }
14059}
14060
14061
14062// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ...))
14063// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
14064// and friends.  Likewise for OR -> CMPNEQSS.
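// When the combine fires, the AND/OR of the two setccs is replaced by a single
// FSETCCss/FSETCCsd node whose low bit is then extracted with an AND against 1
// and truncated to i8.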
14065static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
14066                            TargetLowering::DAGCombinerInfo &DCI,
14067                            const X86Subtarget *Subtarget) {
14068  unsigned opcode;
14069
14070  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
14071  // we're requiring SSE2 for both.
14072  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
14073    SDValue N0 = N->getOperand(0);
14074    SDValue N1 = N->getOperand(1);
14075    SDValue CMP0 = N0->getOperand(1);
14076    SDValue CMP1 = N1->getOperand(1);
14077    DebugLoc DL = N->getDebugLoc();
14078
14079    // The SETCCs should both refer to the same CMP.
14080    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
14081      return SDValue();
14082
14083    SDValue CMP00 = CMP0->getOperand(0);
14084    SDValue CMP01 = CMP0->getOperand(1);
14085    EVT     VT    = CMP00.getValueType();
14086
14087    if (VT == MVT::f32 || VT == MVT::f64) {
14088      bool ExpectingFlags = false;
14089      // Check for any users that want flags:
14090      for (SDNode::use_iterator UI = N->use_begin(),
14091             UE = N->use_end();
14092           !ExpectingFlags && UI != UE; ++UI)
14093        switch (UI->getOpcode()) {
14094        default:
14095        case ISD::BR_CC:
14096        case ISD::BRCOND:
14097        case ISD::SELECT:
14098          ExpectingFlags = true;
14099          break;
14100        case ISD::CopyToReg:
14101        case ISD::SIGN_EXTEND:
14102        case ISD::ZERO_EXTEND:
14103        case ISD::ANY_EXTEND:
14104          break;
14105        }
14106
14107      if (!ExpectingFlags) {
14108        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
14109        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
14110
14111        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
14112          X86::CondCode tmp = cc0;
14113          cc0 = cc1;
14114          cc1 = tmp;
14115        }
14116
14117        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
14118            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
14119          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
14120          X86ISD::NodeType NTOperator = is64BitFP ?
14121            X86ISD::FSETCCsd : X86ISD::FSETCCss;
14122          // FIXME: need symbolic constants for these magic numbers.
14123          // See X86ATTInstPrinter.cpp:printSSECC().
14124          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
14125          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
14126                                              DAG.getConstant(x86cc, MVT::i8));
14127          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
14128                                              OnesOrZeroesF);
14129          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
14130                                      DAG.getConstant(1, MVT::i32));
14131          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
14132          return OneBitOfTruth;
14133        }
14134      }
14135    }
14136  }
14137  return SDValue();
14138}
14139
14140/// CanFoldXORWithAllOnes - Test whether the XOR operand is an all-ones vector
14141/// so it can be folded inside ANDNP.
14142static bool CanFoldXORWithAllOnes(const SDNode *N) {
14143  EVT VT = N->getValueType(0);
14144
14145  // Match direct AllOnes for 128 and 256-bit vectors
14146  if (ISD::isBuildVectorAllOnes(N))
14147    return true;
14148
14149  // Look through a bit convert.
14150  if (N->getOpcode() == ISD::BITCAST)
14151    N = N->getOperand(0).getNode();
14152
14153  // Sometimes the operand may come from an insert_subvector building a 256-bit
14154  // all-ones vector.
14155  if (VT.getSizeInBits() == 256 &&
14156      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
14157    SDValue V1 = N->getOperand(0);
14158    SDValue V2 = N->getOperand(1);
14159
14160    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
14161        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
14162        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
14163        ISD::isBuildVectorAllOnes(V2.getNode()))
14164      return true;
14165  }
14166
14167  return false;
14168}
14169
14170static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
14171                                 TargetLowering::DAGCombinerInfo &DCI,
14172                                 const X86Subtarget *Subtarget) {
14173  if (DCI.isBeforeLegalizeOps())
14174    return SDValue();
14175
14176  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
14177  if (R.getNode())
14178    return R;
14179
14180  EVT VT = N->getValueType(0);
14181
14182  // Create ANDN, BLSI, and BLSR instructions
14183  // BLSI is X & (-X)
14184  // BLSR is X & (X-1)
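  // ANDN is (NOT X) & Y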
14185  if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
14186    SDValue N0 = N->getOperand(0);
14187    SDValue N1 = N->getOperand(1);
14188    DebugLoc DL = N->getDebugLoc();
14189
14190    // Check LHS for not
14191    if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
14192      return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
14193    // Check RHS for not
14194    if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
14195      return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
14196
14197    // Check LHS for neg
14198    if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
14199        isZero(N0.getOperand(0)))
14200      return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
14201
14202    // Check RHS for neg
14203    if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
14204        isZero(N1.getOperand(0)))
14205      return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
14206
14207    // Check LHS for X-1
14208    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
14209        isAllOnes(N0.getOperand(1)))
14210      return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
14211
14212    // Check RHS for X-1
14213    if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
14214        isAllOnes(N1.getOperand(1)))
14215      return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
14216
14217    return SDValue();
14218  }
14219
14220  // Want to form ANDNP nodes:
14221  // 1) In the hopes of then easily combining them with OR and AND nodes
14222  //    to form PBLEND/PSIGN.
14223  // 2) To match ANDN packed intrinsics
14224  if (VT != MVT::v2i64 && VT != MVT::v4i64)
14225    return SDValue();
14226
14227  SDValue N0 = N->getOperand(0);
14228  SDValue N1 = N->getOperand(1);
14229  DebugLoc DL = N->getDebugLoc();
14230
14231  // Check LHS for vnot
14232  if (N0.getOpcode() == ISD::XOR &&
14233      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
14234      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
14235    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
14236
14237  // Check RHS for vnot
14238  if (N1.getOpcode() == ISD::XOR &&
14239      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
14240      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
14241    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
14242
14243  return SDValue();
14244}
14245
14246static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
14247                                TargetLowering::DAGCombinerInfo &DCI,
14248                                const X86Subtarget *Subtarget) {
14249  if (DCI.isBeforeLegalizeOps())
14250    return SDValue();
14251
14252  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
14253  if (R.getNode())
14254    return R;
14255
14256  EVT VT = N->getValueType(0);
14257
14258  SDValue N0 = N->getOperand(0);
14259  SDValue N1 = N->getOperand(1);
14260
14261  // look for psign/blend
14262  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
14263    if (!Subtarget->hasSSSE3() ||
14264        (VT == MVT::v4i64 && !Subtarget->hasAVX2()))
14265      return SDValue();
14266
14267    // Canonicalize pandn to RHS
14268    if (N0.getOpcode() == X86ISD::ANDNP)
14269      std::swap(N0, N1);
14270    // or (and (m, y), (pandn m, x))
14271    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
14272      SDValue Mask = N1.getOperand(0);
14273      SDValue X    = N1.getOperand(1);
14274      SDValue Y;
14275      if (N0.getOperand(0) == Mask)
14276        Y = N0.getOperand(1);
14277      if (N0.getOperand(1) == Mask)
14278        Y = N0.getOperand(0);
14279
14280      // Check to see if the mask appeared in both the AND and the ANDNP.
14281      if (!Y.getNode())
14282        return SDValue();
14283
14284      // Validate that X, Y, and Mask are bitcasts, and see through them.
14285      // Look through mask bitcast.
14286      if (Mask.getOpcode() == ISD::BITCAST)
14287        Mask = Mask.getOperand(0);
14288      if (X.getOpcode() == ISD::BITCAST)
14289        X = X.getOperand(0);
14290      if (Y.getOpcode() == ISD::BITCAST)
14291        Y = Y.getOperand(0);
14292
14293      EVT MaskVT = Mask.getValueType();
14294
14295      // Validate that the Mask operand is a vector sra node.
14296      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
14297      // there is no psrai.b
14298      if (Mask.getOpcode() != X86ISD::VSRAI)
14299        return SDValue();
14300
14301      // Check that the SRA is all signbits.
14302      SDValue SraC = Mask.getOperand(1);
14303      unsigned SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
14304      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
14305      if ((SraAmt + 1) != EltBits)
14306        return SDValue();
14307
14308      DebugLoc DL = N->getDebugLoc();
14309
14310      // Now we know we at least have a pblendvb with the mask val.  See if
14311      // we can form a psignb/w/d.
14312      // psign = x.type == y.type == mask.type && y = sub(0, x);
14313      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
14314          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
14315          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
14316        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14317               "Unsupported VT for PSIGN");
14318        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
14319        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
14320      }
14321      // PBLENDVB is only available on SSE 4.1.
14322      if (!Subtarget->hasSSE41())
14323        return SDValue();
14324
14325      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
14326
14327      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
14328      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
14329      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
14330      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
14331      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
14332    }
14333  }
14334
14335  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
14336    return SDValue();
14337
14338  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
14339  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
14340    std::swap(N0, N1);
14341  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
14342    return SDValue();
14343  if (!N0.hasOneUse() || !N1.hasOneUse())
14344    return SDValue();
14345
14346  SDValue ShAmt0 = N0.getOperand(1);
14347  if (ShAmt0.getValueType() != MVT::i8)
14348    return SDValue();
14349  SDValue ShAmt1 = N1.getOperand(1);
14350  if (ShAmt1.getValueType() != MVT::i8)
14351    return SDValue();
14352  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
14353    ShAmt0 = ShAmt0.getOperand(0);
14354  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
14355    ShAmt1 = ShAmt1.getOperand(0);
14356
14357  DebugLoc DL = N->getDebugLoc();
14358  unsigned Opc = X86ISD::SHLD;
14359  SDValue Op0 = N0.getOperand(0);
14360  SDValue Op1 = N1.getOperand(0);
14361  if (ShAmt0.getOpcode() == ISD::SUB) {
14362    Opc = X86ISD::SHRD;
14363    std::swap(Op0, Op1);
14364    std::swap(ShAmt0, ShAmt1);
14365  }
14366
14367  unsigned Bits = VT.getSizeInBits();
14368  if (ShAmt1.getOpcode() == ISD::SUB) {
14369    SDValue Sum = ShAmt1.getOperand(0);
14370    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
14371      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
14372      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
14373        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
14374      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
14375        return DAG.getNode(Opc, DL, VT,
14376                           Op0, Op1,
14377                           DAG.getNode(ISD::TRUNCATE, DL,
14378                                       MVT::i8, ShAmt0));
14379    }
14380  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
14381    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
14382    if (ShAmt0C &&
14383        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
14384      return DAG.getNode(Opc, DL, VT,
14385                         N0.getOperand(0), N1.getOperand(0),
14386                         DAG.getNode(ISD::TRUNCATE, DL,
14387                                       MVT::i8, ShAmt0));
14388  }
14389
14390  return SDValue();
14391}
14392
14393// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
14394static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
14395                                 TargetLowering::DAGCombinerInfo &DCI,
14396                                 const X86Subtarget *Subtarget) {
14397  if (DCI.isBeforeLegalizeOps())
14398    return SDValue();
14399
14400  EVT VT = N->getValueType(0);
14401
14402  if (VT != MVT::i32 && VT != MVT::i64)
14403    return SDValue();
14404
14405  assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
14406
14407  // Create BLSMSK instructions by finding X ^ (X-1)
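  // Illustrative example: for X = 0b01011000, X-1 = 0b01010111, so
  // X ^ (X-1) = 0b00001111, i.e. a mask covering the lowest set bit of X and
  // all bits below it, which is what BLSMSK computes.  Note that the ADD with
  // an all-ones constant matched below is the canonical form of X-1.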
14408  SDValue N0 = N->getOperand(0);
14409  SDValue N1 = N->getOperand(1);
14410  DebugLoc DL = N->getDebugLoc();
14411
14412  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
14413      isAllOnes(N0.getOperand(1)))
14414    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
14415
14416  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
14417      isAllOnes(N1.getOperand(1)))
14418    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
14419
14420  return SDValue();
14421}
14422
14423/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
14424static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
14425                                   const X86Subtarget *Subtarget) {
14426  LoadSDNode *Ld = cast<LoadSDNode>(N);
14427  EVT RegVT = Ld->getValueType(0);
14428  EVT MemVT = Ld->getMemoryVT();
14429  DebugLoc dl = Ld->getDebugLoc();
14430  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14431
14432  ISD::LoadExtType Ext = Ld->getExtensionType();
14433
14434  // If this is a vector EXTLOAD, then attempt to optimize it using a
14435  // shuffle. We need SSE4.1 for the shuffles.
14436  // TODO: It is possible to support ZExt by zeroing the undef values
14437  // during the shuffle phase or after the shuffle.
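  // Illustrative example: an any-extending load of v4i8 into v4i32 is turned
  // into a single i32 scalar load, a SCALAR_TO_VECTOR into v4i32, a bitcast to
  // v16i8, and a shuffle that scatters bytes 0..3 into byte positions 0, 4, 8
  // and 12 (the remaining bytes are undef), followed by a bitcast back to
  // v4i32.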
14438  if (RegVT.isVector() && RegVT.isInteger() &&
14439      Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
14440    assert(MemVT != RegVT && "Cannot extend to the same type");
14441    assert(MemVT.isVector() && "Must load a vector from memory");
14442
14443    unsigned NumElems = RegVT.getVectorNumElements();
14444    unsigned RegSz = RegVT.getSizeInBits();
14445    unsigned MemSz = MemVT.getSizeInBits();
14446    assert(RegSz > MemSz && "Register size must be greater than the mem size");
14447    // All sizes must be a power of two
14448    if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
14449
14450    // Attempt to load the original value using a single load op.
14451    // Find a scalar type which is equal to the loaded word size.
14452    MVT SclrLoadTy = MVT::i8;
14453    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
14454         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
14455      MVT Tp = (MVT::SimpleValueType)tp;
14456      if (TLI.isTypeLegal(Tp) &&  Tp.getSizeInBits() == MemSz) {
14457        SclrLoadTy = Tp;
14458        break;
14459      }
14460    }
14461
14462    // Proceed if a load word is found.
14463    if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
14464
14465    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
14466      RegSz/SclrLoadTy.getSizeInBits());
14467
14468    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
14469                                  RegSz/MemVT.getScalarType().getSizeInBits());
14470    // Can't shuffle using an illegal type.
14471    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
14472
14473    // Perform a single load.
14474    SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
14475                                  Ld->getBasePtr(),
14476                                  Ld->getPointerInfo(), Ld->isVolatile(),
14477                                  Ld->isNonTemporal(), Ld->isInvariant(),
14478                                  Ld->getAlignment());
14479
14480    // Insert the word loaded into a vector.
14481    SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14482      LoadUnitVecVT, ScalarLoad);
14483
14484    // Bitcast the loaded value to a vector of the original element type, in
14485    // the size of the target vector type.
14486    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT,
14487                                    ScalarInVector);
14488    unsigned SizeRatio = RegSz/MemSz;
14489
14490    // Redistribute the loaded elements into the different locations.
14491    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
14492    for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
14493
14494    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
14495                                         DAG.getUNDEF(WideVecVT),
14496                                         &ShuffleVec[0]);
14497
14498    // Bitcast to the requested type.
14499    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
14500    // Replace the original load with the new sequence
14501    // and return the new chain.
14502    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
14503    return SDValue(ScalarLoad.getNode(), 1);
14504  }
14505
14506  return SDValue();
14507}
14508
14509/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
14510static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
14511                                   const X86Subtarget *Subtarget) {
14512  StoreSDNode *St = cast<StoreSDNode>(N);
14513  EVT VT = St->getValue().getValueType();
14514  EVT StVT = St->getMemoryVT();
14515  DebugLoc dl = St->getDebugLoc();
14516  SDValue StoredVal = St->getOperand(1);
14517  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14518
14519  // If we are saving a concatenation of two XMM registers, perform two stores.
14520  // This is better on Sandy Bridge because one 256-bit memory op is done via two
14521  // 128-bit ones. If in the future the cost becomes only one memory access, the
14522  // first version would be better.
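  // For example, a store of (v8f32 concat_vectors A, B) is split here into two
  // 16-byte stores of A and B at offsets 0 and 16 from the original address,
  // which typically select to two 128-bit store instructions.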
14523  if (VT.getSizeInBits() == 256 &&
14524      StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
14525      StoredVal.getNumOperands() == 2) {
14526
14527    SDValue Value0 = StoredVal.getOperand(0);
14528    SDValue Value1 = StoredVal.getOperand(1);
14529
14530    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
14531    SDValue Ptr0 = St->getBasePtr();
14532    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
14533
14534    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
14535                                St->getPointerInfo(), St->isVolatile(),
14536                                St->isNonTemporal(), St->getAlignment());
14537    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
14538                                St->getPointerInfo(), St->isVolatile(),
14539                                St->isNonTemporal(), St->getAlignment());
14540    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
14541  }
14542
14543  // Optimize trunc store (of multiple scalars) to shuffle and store.
14544  // First, pack all of the elements in one place. Next, store to memory
14545  // in fewer chunks.
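  // Illustrative example: a truncating store of v8i16 to v8i8 bitcasts the
  // value to v16i8, shuffles the eight low (truncated) bytes down into
  // elements 0..7, and then writes the packed result with a single 64-bit
  // store rather than many narrow stores.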
14546  if (St->isTruncatingStore() && VT.isVector()) {
14547    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14548    unsigned NumElems = VT.getVectorNumElements();
14549    assert(StVT != VT && "Cannot truncate to the same type");
14550    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
14551    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
14552
14553    // From/To sizes and ElemCount must be powers of two.
14554    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
14555    // We are going to use the original vector elt for storing.
14556    // Accumulated smaller vector elements must be a multiple of the store size.
14557    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
14558
14559    unsigned SizeRatio  = FromSz / ToSz;
14560
14561    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
14562
14563    // Create a type on which we perform the shuffle
14564    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
14565            StVT.getScalarType(), NumElems*SizeRatio);
14566
14567    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
14568
14569    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
14570    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
14571    for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
14572
14573    // Can't shuffle using an illegal type
14574    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
14575
14576    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
14577                                         DAG.getUNDEF(WideVecVT),
14578                                         &ShuffleVec[0]);
14579    // At this point all of the data is stored at the bottom of the
14580    // register. We now need to save it to memory.
14581
14582    // Find the largest store unit
14583    MVT StoreType = MVT::i8;
14584    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
14585         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
14586      MVT Tp = (MVT::SimpleValueType)tp;
14587      if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
14588        StoreType = Tp;
14589    }
14590
14591    // Bitcast the original vector into a vector of store-size units
14592    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
14593            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
14594    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
14595    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
14596    SmallVector<SDValue, 8> Chains;
14597    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
14598                                        TLI.getPointerTy());
14599    SDValue Ptr = St->getBasePtr();
14600
14601    // Perform one or more big stores into memory.
14602    for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
14603      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14604                                   StoreType, ShuffWide,
14605                                   DAG.getIntPtrConstant(i));
14606      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
14607                                St->getPointerInfo(), St->isVolatile(),
14608                                St->isNonTemporal(), St->getAlignment());
14609      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
14610      Chains.push_back(Ch);
14611    }
14612
14613    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
14614                               Chains.size());
14615  }
14616
14617
14618  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
14619  // the FP state in cases where an emms may be missing.
14620  // A preferable solution to the general problem is to figure out the right
14621  // places to insert EMMS.  This qualifies as a quick hack.
14622
14623  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
14624  if (VT.getSizeInBits() != 64)
14625    return SDValue();
14626
14627  const Function *F = DAG.getMachineFunction().getFunction();
14628  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
14629  bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
14630                     && Subtarget->hasSSE2();
14631  if ((VT.isVector() ||
14632       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
14633      isa<LoadSDNode>(St->getValue()) &&
14634      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
14635      St->getChain().hasOneUse() && !St->isVolatile()) {
14636    SDNode* LdVal = St->getValue().getNode();
14637    LoadSDNode *Ld = 0;
14638    int TokenFactorIndex = -1;
14639    SmallVector<SDValue, 8> Ops;
14640    SDNode* ChainVal = St->getChain().getNode();
14641    // Must be a store of a load.  We currently handle two cases:  the load
14642    // is a direct child, or it is under an intervening TokenFactor.  It is
14643    // possible to dig deeper under nested TokenFactors.
14644    if (ChainVal == LdVal)
14645      Ld = cast<LoadSDNode>(St->getChain());
14646    else if (St->getValue().hasOneUse() &&
14647             ChainVal->getOpcode() == ISD::TokenFactor) {
14648      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
14649        if (ChainVal->getOperand(i).getNode() == LdVal) {
14650          TokenFactorIndex = i;
14651          Ld = cast<LoadSDNode>(St->getValue());
14652        } else
14653          Ops.push_back(ChainVal->getOperand(i));
14654      }
14655    }
14656
14657    if (!Ld || !ISD::isNormalLoad(Ld))
14658      return SDValue();
14659
14660    // If this is not the MMX case, i.e. we are just turning i64 load/store
14661    // into f64 load/store, avoid the transformation if there are multiple
14662    // uses of the loaded value.
14663    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
14664      return SDValue();
14665
14666    DebugLoc LdDL = Ld->getDebugLoc();
14667    DebugLoc StDL = N->getDebugLoc();
14668    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
14669    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
14670    // pair instead.
14671    if (Subtarget->is64Bit() || F64IsLegal) {
14672      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
14673      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
14674                                  Ld->getPointerInfo(), Ld->isVolatile(),
14675                                  Ld->isNonTemporal(), Ld->isInvariant(),
14676                                  Ld->getAlignment());
14677      SDValue NewChain = NewLd.getValue(1);
14678      if (TokenFactorIndex != -1) {
14679        Ops.push_back(NewChain);
14680        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
14681                               Ops.size());
14682      }
14683      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
14684                          St->getPointerInfo(),
14685                          St->isVolatile(), St->isNonTemporal(),
14686                          St->getAlignment());
14687    }
14688
14689    // Otherwise, lower to two pairs of 32-bit loads / stores.
14690    SDValue LoAddr = Ld->getBasePtr();
14691    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
14692                                 DAG.getConstant(4, MVT::i32));
14693
14694    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
14695                               Ld->getPointerInfo(),
14696                               Ld->isVolatile(), Ld->isNonTemporal(),
14697                               Ld->isInvariant(), Ld->getAlignment());
14698    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
14699                               Ld->getPointerInfo().getWithOffset(4),
14700                               Ld->isVolatile(), Ld->isNonTemporal(),
14701                               Ld->isInvariant(),
14702                               MinAlign(Ld->getAlignment(), 4));
14703
14704    SDValue NewChain = LoLd.getValue(1);
14705    if (TokenFactorIndex != -1) {
14706      Ops.push_back(LoLd);
14707      Ops.push_back(HiLd);
14708      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
14709                             Ops.size());
14710    }
14711
14712    LoAddr = St->getBasePtr();
14713    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
14714                         DAG.getConstant(4, MVT::i32));
14715
14716    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
14717                                St->getPointerInfo(),
14718                                St->isVolatile(), St->isNonTemporal(),
14719                                St->getAlignment());
14720    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
14721                                St->getPointerInfo().getWithOffset(4),
14722                                St->isVolatile(),
14723                                St->isNonTemporal(),
14724                                MinAlign(St->getAlignment(), 4));
14725    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
14726  }
14727  return SDValue();
14728}
14729
14730/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
14731/// and return the operands for the horizontal operation in LHS and RHS.  A
14732/// horizontal operation performs the binary operation on successive elements
14733/// of its first operand, then on successive elements of its second operand,
14734/// returning the resulting values in a vector.  For example, if
14735///   A = < float a0, float a1, float a2, float a3 >
14736/// and
14737///   B = < float b0, float b1, float b2, float b3 >
14738/// then the result of doing a horizontal operation on A and B is
14739///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
14740/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
14741/// A horizontal-op B, for some already available A and B, and if so then LHS is
14742/// set to A, RHS to B, and the routine returns 'true'.
14743/// Note that the binary operation should have the property that if one of the
14744/// operands is UNDEF then the result is UNDEF.
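/// For example, for v4f32 this is what the SSE3 haddps instruction computes:
///   haddps A, B = < a0+a1, a2+a3, b0+b1, b2+b3 >.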
14745static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
14746  // Look for the following pattern: if
14747  //   A = < float a0, float a1, float a2, float a3 >
14748  //   B = < float b0, float b1, float b2, float b3 >
14749  // and
14750  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
14751  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
14752  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
14753  // which is A horizontal-op B.
14754
14755  // At least one of the operands should be a vector shuffle.
14756  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
14757      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
14758    return false;
14759
14760  EVT VT = LHS.getValueType();
14761
14762  assert((VT.is128BitVector() || VT.is256BitVector()) &&
14763         "Unsupported vector type for horizontal add/sub");
14764
14765  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
14766  // operate independently on 128-bit lanes.
14767  unsigned NumElts = VT.getVectorNumElements();
14768  unsigned NumLanes = VT.getSizeInBits()/128;
14769  unsigned NumLaneElts = NumElts / NumLanes;
14770  assert((NumLaneElts % 2 == 0) &&
14771         "Vector type should have an even number of elements in each lane");
14772  unsigned HalfLaneElts = NumLaneElts/2;
14773
14774  // View LHS in the form
14775  //   LHS = VECTOR_SHUFFLE A, B, LMask
14776  // If LHS is not a shuffle then pretend it is the shuffle
14777  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
14778  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
14779  // type VT.
14780  SDValue A, B;
14781  SmallVector<int, 16> LMask(NumElts);
14782  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
14783    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
14784      A = LHS.getOperand(0);
14785    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
14786      B = LHS.getOperand(1);
14787    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
14788    std::copy(Mask.begin(), Mask.end(), LMask.begin());
14789  } else {
14790    if (LHS.getOpcode() != ISD::UNDEF)
14791      A = LHS;
14792    for (unsigned i = 0; i != NumElts; ++i)
14793      LMask[i] = i;
14794  }
14795
14796  // Likewise, view RHS in the form
14797  //   RHS = VECTOR_SHUFFLE C, D, RMask
14798  SDValue C, D;
14799  SmallVector<int, 16> RMask(NumElts);
14800  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
14801    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
14802      C = RHS.getOperand(0);
14803    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
14804      D = RHS.getOperand(1);
14805    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
14806    std::copy(Mask.begin(), Mask.end(), RMask.begin());
14807  } else {
14808    if (RHS.getOpcode() != ISD::UNDEF)
14809      C = RHS;
14810    for (unsigned i = 0; i != NumElts; ++i)
14811      RMask[i] = i;
14812  }
14813
14814  // Check that the shuffles are both shuffling the same vectors.
14815  if (!(A == C && B == D) && !(A == D && B == C))
14816    return false;
14817
14818  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
14819  if (!A.getNode() && !B.getNode())
14820    return false;
14821
14822  // If A and B occur in reverse order in RHS, then "swap" them (which means
14823  // rewriting the mask).
14824  if (A != C)
14825    CommuteVectorShuffleMask(RMask, NumElts);
14826
14827  // At this point LHS and RHS are equivalent to
14828  //   LHS = VECTOR_SHUFFLE A, B, LMask
14829  //   RHS = VECTOR_SHUFFLE A, B, RMask
14830  // Check that the masks correspond to performing a horizontal operation.
14831  for (unsigned i = 0; i != NumElts; ++i) {
14832    int LIdx = LMask[i], RIdx = RMask[i];
14833
14834    // Ignore any UNDEF components.
14835    if (LIdx < 0 || RIdx < 0 ||
14836        (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
14837        (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
14838      continue;
14839
14840    // Check that successive elements are being operated on.  If not, this is
14841    // not a horizontal operation.
14842    unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
14843    unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
14844    int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
14845    if (!(LIdx == Index && RIdx == Index + 1) &&
14846        !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
14847      return false;
14848  }
14849
14850  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
14851  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
14852  return true;
14853}
14854
14855/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
14856static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
14857                                  const X86Subtarget *Subtarget) {
14858  EVT VT = N->getValueType(0);
14859  SDValue LHS = N->getOperand(0);
14860  SDValue RHS = N->getOperand(1);
14861
14862  // Try to synthesize horizontal adds from adds of shuffles.
14863  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
14864       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
14865      isHorizontalBinOp(LHS, RHS, true))
14866    return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
14867  return SDValue();
14868}
14869
14870/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
14871static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
14872                                  const X86Subtarget *Subtarget) {
14873  EVT VT = N->getValueType(0);
14874  SDValue LHS = N->getOperand(0);
14875  SDValue RHS = N->getOperand(1);
14876
14877  // Try to synthesize horizontal subs from subs of shuffles.
14878  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
14879       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
14880      isHorizontalBinOp(LHS, RHS, false))
14881    return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
14882  return SDValue();
14883}
14884
14885/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
14886/// X86ISD::FXOR nodes.
14887static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
14888  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
14889  // F[X]OR(0.0, x) -> x
14890  // F[X]OR(x, 0.0) -> x
14891  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
14892    if (C->getValueAPF().isPosZero())
14893      return N->getOperand(1);
14894  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
14895    if (C->getValueAPF().isPosZero())
14896      return N->getOperand(0);
14897  return SDValue();
14898}
14899
14900/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
14901static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
14902  // FAND(0.0, x) -> 0.0
14903  // FAND(x, 0.0) -> 0.0
14904  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
14905    if (C->getValueAPF().isPosZero())
14906      return N->getOperand(0);
14907  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
14908    if (C->getValueAPF().isPosZero())
14909      return N->getOperand(1);
14910  return SDValue();
14911}
14912
14913static SDValue PerformBTCombine(SDNode *N,
14914                                SelectionDAG &DAG,
14915                                TargetLowering::DAGCombinerInfo &DCI) {
14916  // BT ignores high bits in the bit index operand.
14917  SDValue Op1 = N->getOperand(1);
14918  if (Op1.hasOneUse()) {
14919    unsigned BitWidth = Op1.getValueSizeInBits();
14920    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
14921    APInt KnownZero, KnownOne;
14922    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
14923                                          !DCI.isBeforeLegalizeOps());
14924    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14925    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
14926        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
14927      DCI.CommitTargetLoweringOpt(TLO);
14928  }
14929  return SDValue();
14930}
14931
14932static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
14933  SDValue Op = N->getOperand(0);
14934  if (Op.getOpcode() == ISD::BITCAST)
14935    Op = Op.getOperand(0);
14936  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
14937  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
14938      VT.getVectorElementType().getSizeInBits() ==
14939      OpVT.getVectorElementType().getSizeInBits()) {
14940    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
14941  }
14942  return SDValue();
14943}
14944
14945static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
14946                                  TargetLowering::DAGCombinerInfo &DCI,
14947                                  const X86Subtarget *Subtarget) {
14948  if (!DCI.isBeforeLegalizeOps())
14949    return SDValue();
14950
14951  if (!Subtarget->hasAVX())
14952    return SDValue();
14953
14954  EVT VT = N->getValueType(0);
14955  SDValue Op = N->getOperand(0);
14956  EVT OpVT = Op.getValueType();
14957  DebugLoc dl = N->getDebugLoc();
14958
14959  if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
14960      (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
14961
14962    if (Subtarget->hasAVX2())
14963      return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
14964
14965    // Optimize vectors in AVX mode:
14966    // sign extend v8i16 to v8i32 and
14967    //             v4i32 to v4i64.
14968    //
14969    // Divide the input vector into two parts. For v4i32 the two shuffle masks
14970    // will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }. Use the vpmovsx family of
14971    // instructions to extend v4i32 -> v2i64 and v8i16 -> v4i32, then
14972    // concatenate the two halves back to the original VT.
14973
14974    unsigned NumElems = OpVT.getVectorNumElements();
14975    SmallVector<int,8> ShufMask1(NumElems, -1);
14976    for (unsigned i = 0; i != NumElems/2; ++i)
14977      ShufMask1[i] = i;
14978
14979    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
14980                                        &ShufMask1[0]);
14981
14982    SmallVector<int,8> ShufMask2(NumElems, -1);
14983    for (unsigned i = 0; i != NumElems/2; ++i)
14984      ShufMask2[i] = i + NumElems/2;
14985
14986    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
14987                                        &ShufMask2[0]);
14988
14989    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
14990                                  VT.getVectorNumElements()/2);
14991
14992    OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
14993    OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
14994
14995    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14996  }
14997  return SDValue();
14998}
14999
15000static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
15001                                  TargetLowering::DAGCombinerInfo &DCI,
15002                                  const X86Subtarget *Subtarget) {
15003  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
15004  //           (and (i32 x86isd::setcc_carry), 1)
15005  // This eliminates the zext. This transformation is necessary because
15006  // ISD::SETCC is always legalized to i8.
15007  DebugLoc dl = N->getDebugLoc();
15008  SDValue N0 = N->getOperand(0);
15009  EVT VT = N->getValueType(0);
15010  EVT OpVT = N0.getValueType();
15011
15012  if (N0.getOpcode() == ISD::AND &&
15013      N0.hasOneUse() &&
15014      N0.getOperand(0).hasOneUse()) {
15015    SDValue N00 = N0.getOperand(0);
15016    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
15017      return SDValue();
15018    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
15019    if (!C || C->getZExtValue() != 1)
15020      return SDValue();
15021    return DAG.getNode(ISD::AND, dl, VT,
15022                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
15023                                   N00.getOperand(0), N00.getOperand(1)),
15024                       DAG.getConstant(1, VT));
15025  }
15026
15027  // Optimize vectors in AVX mode:
15028  //
15029  //   v8i16 -> v8i32
15030  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
15031  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
15032  //   Concat upper and lower parts.
15033  //
15034  //   v4i32 -> v4i64
15035  //   Use vpunpckldq for the 2 lower elements  v4i32 -> v2i64.
15036  //   Use vpunpckhdq for the 2 upper elements  v4i32 -> v2i64.
15037  //   Concat upper and lower parts.
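  //   For instance, with N0 = <x0,x1,x2,x3> : v4i32, unpacking against a zero
  //   vector gives OpLo = <x0,0,x1,0> and OpHi = <x2,0,x3,0>; reinterpreted as
  //   v2i64 on a little-endian target these are exactly the zero-extended low
  //   and high halves.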
15038  //
15039  if (!DCI.isBeforeLegalizeOps())
15040    return SDValue();
15041
15042  if (!Subtarget->hasAVX())
15043    return SDValue();
15044
15045  if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
15046      ((VT == MVT::v4i64) && (OpVT == MVT::v4i32)))  {
15047
15048    if (Subtarget->hasAVX2())
15049      return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
15050
15051    SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
15052    SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec);
15053    SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec);
15054
15055    EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
15056                               VT.getVectorNumElements()/2);
15057
15058    OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
15059    OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
15060
15061    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15062  }
15063
15064  return SDValue();
15065}
15066
15067// Optimize x == -y --> x+y == 0
15068//          x != -y --> x+y != 0
15069static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
15070  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
15071  SDValue LHS = N->getOperand(0);
15072  SDValue RHS = N->getOperand(1);
15073
15074  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
15075    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
15076      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
15077        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
15078                                   LHS.getValueType(), RHS, LHS.getOperand(1));
15079        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
15080                            addV, DAG.getConstant(0, addV.getValueType()), CC);
15081      }
15082  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
15083    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
15084      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
15085        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
15086                                   RHS.getValueType(), LHS, RHS.getOperand(1));
15087        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
15088                            addV, DAG.getConstant(0, addV.getValueType()), CC);
15089      }
15090  return SDValue();
15091}
15092
15093// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
15094static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
15095  unsigned X86CC = N->getConstantOperandVal(0);
15096  SDValue EFLAG = N->getOperand(1);
15097  DebugLoc DL = N->getDebugLoc();
15098
15099  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
15100  // a zext and produces an all-ones bit which is more useful than 0/1 in some
15101  // cases.
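  // E.g. "sbb %eax, %eax" yields 0 or 0xffffffff depending on the carry flag;
  // masking the result with 1 recovers the usual 0/1 value when it is needed.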
15102  if (X86CC == X86::COND_B)
15103    return DAG.getNode(ISD::AND, DL, MVT::i8,
15104                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
15105                                   DAG.getConstant(X86CC, MVT::i8), EFLAG),
15106                       DAG.getConstant(1, MVT::i8));
15107
15108  return SDValue();
15109}
15110
15111static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) {
15112  SDValue Op0 = N->getOperand(0);
15113  EVT InVT = Op0->getValueType(0);
15114
15115  // UINT_TO_FP(v4i8/v8i8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
15116  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
15117    DebugLoc dl = N->getDebugLoc();
15118    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
15119    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
15120    // Notice that we use SINT_TO_FP because we know that the high bits
15121    // are zero and SINT_TO_FP is better supported by the hardware.
15122    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
15123  }
15124
15125  return SDValue();
15126}
15127
15128static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
15129                                        const X86TargetLowering *XTLI) {
15130  SDValue Op0 = N->getOperand(0);
15131  EVT InVT = Op0->getValueType(0);
15132
15133  // SINT_TO_FP(v4i8/v8i8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
15134  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
15135    DebugLoc dl = N->getDebugLoc();
15136    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
15137    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
15138    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
15139  }
15140
15141  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
15142  // a 32-bit target where SSE doesn't support i64->FP operations.
15143  if (Op0.getOpcode() == ISD::LOAD) {
15144    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
15145    EVT VT = Ld->getValueType(0);
15146    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
15147        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
15148        !XTLI->getSubtarget()->is64Bit() &&
15149        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
15150      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
15151                                          Ld->getChain(), Op0, DAG);
15152      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
15153      return FILDChain;
15154    }
15155  }
15156  return SDValue();
15157}
15158
15159static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) {
15160  EVT VT = N->getValueType(0);
15161
15162  // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (v4i32 = FP_TO_SINT())
15163  if (VT == MVT::v8i8 || VT == MVT::v4i8) {
15164    DebugLoc dl = N->getDebugLoc();
15165    MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
15166    SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0));
15167    return DAG.getNode(ISD::TRUNCATE, dl, VT, I);
15168  }
15169
15170  return SDValue();
15171}
15172
15173// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
15174static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
15175                                 X86TargetLowering::DAGCombinerInfo &DCI) {
15176  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
15177  // the result is either zero or one (depending on the input carry bit).
15178  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
15179  if (X86::isZeroNode(N->getOperand(0)) &&
15180      X86::isZeroNode(N->getOperand(1)) &&
15181      // We don't have a good way to replace an EFLAGS use, so only do this when
15182      // dead right now.
15183      SDValue(N, 1).use_empty()) {
15184    DebugLoc DL = N->getDebugLoc();
15185    EVT VT = N->getValueType(0);
15186    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
15187    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
15188                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
15189                                           DAG.getConstant(X86::COND_B,MVT::i8),
15190                                           N->getOperand(2)),
15191                               DAG.getConstant(1, VT));
15192    return DCI.CombineTo(N, Res1, CarryOut);
15193  }
15194
15195  return SDValue();
15196}
15197
15198// fold (add Y, (sete  X, 0)) -> adc  0, Y
15199//      (add Y, (setne X, 0)) -> sbb -1, Y
15200//      (sub (sete  X, 0), Y) -> sbb  0, Y
15201//      (sub (setne X, 0), Y) -> adc -1, Y
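// For example, (add Y, (sete X, 0)) becomes roughly
//   cmp $1, X      ; sets CF iff X == 0
//   adc $0, Y      ; Y += CF
// so Y is incremented exactly when X is zero.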
15202static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
15203  DebugLoc DL = N->getDebugLoc();
15204
15205  // Look through ZExts.
15206  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
15207  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
15208    return SDValue();
15209
15210  SDValue SetCC = Ext.getOperand(0);
15211  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
15212    return SDValue();
15213
15214  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
15215  if (CC != X86::COND_E && CC != X86::COND_NE)
15216    return SDValue();
15217
15218  SDValue Cmp = SetCC.getOperand(1);
15219  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
15220      !X86::isZeroNode(Cmp.getOperand(1)) ||
15221      !Cmp.getOperand(0).getValueType().isInteger())
15222    return SDValue();
15223
15224  SDValue CmpOp0 = Cmp.getOperand(0);
15225  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
15226                               DAG.getConstant(1, CmpOp0.getValueType()));
15227
15228  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
15229  if (CC == X86::COND_NE)
15230    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
15231                       DL, OtherVal.getValueType(), OtherVal,
15232                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
15233  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
15234                     DL, OtherVal.getValueType(), OtherVal,
15235                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
15236}
15237
15238/// PerformAddCombine - Do target-specific dag combines on integer adds.
15239static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
15240                                 const X86Subtarget *Subtarget) {
15241  EVT VT = N->getValueType(0);
15242  SDValue Op0 = N->getOperand(0);
15243  SDValue Op1 = N->getOperand(1);
15244
15245  // Try to synthesize horizontal adds from adds of shuffles.
15246  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
15247       (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
15248      isHorizontalBinOp(Op0, Op1, true))
15249    return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
15250
15251  return OptimizeConditionalInDecrement(N, DAG);
15252}
15253
15254static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
15255                                 const X86Subtarget *Subtarget) {
15256  SDValue Op0 = N->getOperand(0);
15257  SDValue Op1 = N->getOperand(1);
15258
15259  // X86 can't encode an immediate LHS of a sub. See if we can push the
15260  // negation into a preceding instruction.
15261  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
15262    // If the RHS of the sub is a XOR with one use and a constant, invert the
15263    // immediate. Then add one to the LHS of the sub so we can turn
15264    // X-Y -> X+~Y+1, saving one register.
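    // This relies on the identity -(Y ^ Imm) == (Y ^ ~Imm) + 1, so
    //   X - (Y ^ Imm)  ==  (Y ^ ~Imm) + (X + 1).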
15265    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
15266        isa<ConstantSDNode>(Op1.getOperand(1))) {
15267      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
15268      EVT VT = Op0.getValueType();
15269      SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
15270                                   Op1.getOperand(0),
15271                                   DAG.getConstant(~XorC, VT));
15272      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
15273                         DAG.getConstant(C->getAPIntValue()+1, VT));
15274    }
15275  }
15276
15277  // Try to synthesize horizontal subs from subs of shuffles.
15278  EVT VT = N->getValueType(0);
15279  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
15280       (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
15281      isHorizontalBinOp(Op0, Op1, false))
15282    return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
15283
15284  return OptimizeConditionalInDecrement(N, DAG);
15285}
15286
15287SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
15288                                             DAGCombinerInfo &DCI) const {
15289  SelectionDAG &DAG = DCI.DAG;
15290  switch (N->getOpcode()) {
15291  default: break;
15292  case ISD::EXTRACT_VECTOR_ELT:
15293    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
15294  case ISD::VSELECT:
15295  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
15296  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
15297  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
15298  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
15299  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
15300  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
15301  case ISD::SHL:
15302  case ISD::SRA:
15303  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
15304  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
15305  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
15306  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
15307  case ISD::LOAD:           return PerformLOADCombine(N, DAG, Subtarget);
15308  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
15309  case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG);
15310  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
15311  case ISD::FP_TO_SINT:     return PerformFP_TO_SINTCombine(N, DAG);
15312  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
15313  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
15314  case X86ISD::FXOR:
15315  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
15316  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
15317  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
15318  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
15319  case ISD::ANY_EXTEND:
15320  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
15321  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
15322  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
15323  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
15324  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
15325  case X86ISD::SHUFP:       // Handle all target specific shuffles
15326  case X86ISD::PALIGN:
15327  case X86ISD::UNPCKH:
15328  case X86ISD::UNPCKL:
15329  case X86ISD::MOVHLPS:
15330  case X86ISD::MOVLHPS:
15331  case X86ISD::PSHUFD:
15332  case X86ISD::PSHUFHW:
15333  case X86ISD::PSHUFLW:
15334  case X86ISD::MOVSS:
15335  case X86ISD::MOVSD:
15336  case X86ISD::VPERMILP:
15337  case X86ISD::VPERM2X128:
15338  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
15339  }
15340
15341  return SDValue();
15342}
15343
15344/// isTypeDesirableForOp - Return true if the target has native support for
15345/// the specified value type and it is 'desirable' to use the type for the
15346/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
15347/// instruction encodings are longer and some i16 instructions are slow.
15348bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
15349  if (!isTypeLegal(VT))
15350    return false;
15351  if (VT != MVT::i16)
15352    return true;
15353
15354  switch (Opc) {
15355  default:
15356    return true;
15357  case ISD::LOAD:
15358  case ISD::SIGN_EXTEND:
15359  case ISD::ZERO_EXTEND:
15360  case ISD::ANY_EXTEND:
15361  case ISD::SHL:
15362  case ISD::SRL:
15363  case ISD::SUB:
15364  case ISD::ADD:
15365  case ISD::MUL:
15366  case ISD::AND:
15367  case ISD::OR:
15368  case ISD::XOR:
15369    return false;
15370  }
15371}
15372
15373/// IsDesirableToPromoteOp - This method queries the target whether it is
15374/// beneficial for dag combiner to promote the specified node. If true, it
15375/// should return the desired promotion type by reference.
15376bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
15377  EVT VT = Op.getValueType();
15378  if (VT != MVT::i16)
15379    return false;
15380
15381  bool Promote = false;
15382  bool Commute = false;
15383  switch (Op.getOpcode()) {
15384  default: break;
15385  case ISD::LOAD: {
15386    LoadSDNode *LD = cast<LoadSDNode>(Op);
15387    // If the non-extending load has a single use and it's not live out, then it
15388    // might be folded.
15389    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
15390                                                     Op.hasOneUse()*/) {
15391      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15392             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
15393        // The only case where we'd want to promote LOAD (rather than it being
15394        // promoted as an operand) is when its only use is live out
15395        // (i.e. a CopyToReg).
15395        if (UI->getOpcode() != ISD::CopyToReg)
15396          return false;
15397      }
15398    }
15399    Promote = true;
15400    break;
15401  }
15402  case ISD::SIGN_EXTEND:
15403  case ISD::ZERO_EXTEND:
15404  case ISD::ANY_EXTEND:
15405    Promote = true;
15406    break;
15407  case ISD::SHL:
15408  case ISD::SRL: {
15409    SDValue N0 = Op.getOperand(0);
15410    // Look out for (store (shl (load), x)).
15411    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
15412      return false;
15413    Promote = true;
15414    break;
15415  }
15416  case ISD::ADD:
15417  case ISD::MUL:
15418  case ISD::AND:
15419  case ISD::OR:
15420  case ISD::XOR:
15421    Commute = true;
15422    // fallthrough
15423  case ISD::SUB: {
15424    SDValue N0 = Op.getOperand(0);
15425    SDValue N1 = Op.getOperand(1);
15426    if (!Commute && MayFoldLoad(N1))
15427      return false;
15428    // Avoid disabling potential load folding opportunities.
15429    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
15430      return false;
15431    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
15432      return false;
15433    Promote = true;
15434  }
15435  }
15436
15437  PVT = MVT::i32;
15438  return Promote;
15439}
15440
15441//===----------------------------------------------------------------------===//
15442//                           X86 Inline Assembly Support
15443//===----------------------------------------------------------------------===//
15444
15445namespace {
15446  // Helper to match a sequence of whitespace-separated pieces against a string.
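  // E.g. matchAsm("  bswapl   $0", "bswapl", "$0") returns true, while
  // matchAsm("bswap $0", "bswapl", "$0") returns false.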
15447  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
15448    s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
15449
15450    for (unsigned i = 0, e = args.size(); i != e; ++i) {
15451      StringRef piece(*args[i]);
15452      if (!s.startswith(piece)) // Check if the piece matches.
15453        return false;
15454
15455      s = s.substr(piece.size());
15456      StringRef::size_type pos = s.find_first_not_of(" \t");
15457      if (pos == 0) // We matched a prefix.
15458        return false;
15459
15460      s = s.substr(pos);
15461    }
15462
15463    return s.empty();
15464  }
15465  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
15466}
15467
15468bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
15469  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
15470
15471  std::string AsmStr = IA->getAsmString();
15472
15473  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
15474  if (!Ty || Ty->getBitWidth() % 16 != 0)
15475    return false;
15476
15477  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
15478  SmallVector<StringRef, 4> AsmPieces;
15479  SplitString(AsmStr, AsmPieces, ";\n");
15480
15481  switch (AsmPieces.size()) {
15482  default: return false;
15483  case 1:
15484    // FIXME: this should verify that we are targeting a 486 or better.  If not,
15485    // we will turn this bswap into something that will be lowered to logical
15486    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
15487    // lower so don't worry about this.
15488    // bswap $0
15489    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
15490        matchAsm(AsmPieces[0], "bswapl", "$0") ||
15491        matchAsm(AsmPieces[0], "bswapq", "$0") ||
15492        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
15493        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
15494        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
15495      // No need to check constraints, nothing other than the equivalent of
15496      // "=r,0" would be valid here.
15497      return IntrinsicLowering::LowerToByteSwap(CI);
15498    }
15499
15500    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
15501    if (CI->getType()->isIntegerTy(16) &&
15502        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
15503        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
15504         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
15505      AsmPieces.clear();
15506      const std::string &ConstraintsStr = IA->getConstraintString();
15507      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
15508      std::sort(AsmPieces.begin(), AsmPieces.end());
15509      if (AsmPieces.size() == 4 &&
15510          AsmPieces[0] == "~{cc}" &&
15511          AsmPieces[1] == "~{dirflag}" &&
15512          AsmPieces[2] == "~{flags}" &&
15513          AsmPieces[3] == "~{fpsr}")
15514        return IntrinsicLowering::LowerToByteSwap(CI);
15515    }
15516    break;
15517  case 3:
15518    if (CI->getType()->isIntegerTy(32) &&
15519        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
15520        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
15521        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
15522        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
15523      AsmPieces.clear();
15524      const std::string &ConstraintsStr = IA->getConstraintString();
15525      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
15526      std::sort(AsmPieces.begin(), AsmPieces.end());
15527      if (AsmPieces.size() == 4 &&
15528          AsmPieces[0] == "~{cc}" &&
15529          AsmPieces[1] == "~{dirflag}" &&
15530          AsmPieces[2] == "~{flags}" &&
15531          AsmPieces[3] == "~{fpsr}")
15532        return IntrinsicLowering::LowerToByteSwap(CI);
15533    }
15534
15535    if (CI->getType()->isIntegerTy(64)) {
15536      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
15537      if (Constraints.size() >= 2 &&
15538          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
15539          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
15540        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
15541        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
15542            matchAsm(AsmPieces[1], "bswap", "%edx") &&
15543            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
15544          return IntrinsicLowering::LowerToByteSwap(CI);
15545      }
15546    }
15547    break;
15548  }
15549  return false;
15550}
15551
15552
15553
15554/// getConstraintType - Given a constraint letter, return the type of
15555/// constraint it is for this target.
15556X86TargetLowering::ConstraintType
15557X86TargetLowering::getConstraintType(const std::string &Constraint) const {
15558  if (Constraint.size() == 1) {
15559    switch (Constraint[0]) {
15560    case 'R':
15561    case 'q':
15562    case 'Q':
15563    case 'f':
15564    case 't':
15565    case 'u':
15566    case 'y':
15567    case 'x':
15568    case 'Y':
15569    case 'l':
15570      return C_RegisterClass;
15571    case 'a':
15572    case 'b':
15573    case 'c':
15574    case 'd':
15575    case 'S':
15576    case 'D':
15577    case 'A':
15578      return C_Register;
15579    case 'I':
15580    case 'J':
15581    case 'K':
15582    case 'L':
15583    case 'M':
15584    case 'N':
15585    case 'G':
15586    case 'C':
15587    case 'e':
15588    case 'Z':
15589      return C_Other;
15590    default:
15591      break;
15592    }
15593  }
15594  return TargetLowering::getConstraintType(Constraint);
15595}
15596
15597/// Examine constraint type and operand type and determine a weight value.
15598/// This object must already have been set up with the operand type
15599/// and the current alternative constraint selected.
15600TargetLowering::ConstraintWeight
15601  X86TargetLowering::getSingleConstraintMatchWeight(
15602    AsmOperandInfo &info, const char *constraint) const {
15603  ConstraintWeight weight = CW_Invalid;
15604  Value *CallOperandVal = info.CallOperandVal;
15605  // If we don't have a value, we can't do a match,
15606  // but allow it at the lowest weight.
15607  if (CallOperandVal == NULL)
15608    return CW_Default;
15609  Type *type = CallOperandVal->getType();
15610  // Look at the constraint type.
15611  switch (*constraint) {
15612  default:
15613    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
15614  case 'R':
15615  case 'q':
15616  case 'Q':
15617  case 'a':
15618  case 'b':
15619  case 'c':
15620  case 'd':
15621  case 'S':
15622  case 'D':
15623  case 'A':
15624    if (CallOperandVal->getType()->isIntegerTy())
15625      weight = CW_SpecificReg;
15626    break;
15627  case 'f':
15628  case 't':
15629  case 'u':
15630    if (type->isFloatingPointTy())
15631      weight = CW_SpecificReg;
15632    break;
15633  case 'y':
15634    if (type->isX86_MMXTy() && Subtarget->hasMMX())
15635      weight = CW_SpecificReg;
15636    break;
15637  case 'x':
15638  case 'Y':
15639    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
15640        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX()))
15641      weight = CW_Register;
15642    break;
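  // Immediate constraints: the weight becomes CW_Constant only when the
  // operand is a ConstantInt in the letter's range ('I' 0..31, 'J' 0..63,
  // 'K' signed 8-bit, 'L' 0xff or 0xffff, 'M' 0..3, 'N' 0..0xff,
  // 'e' signed 32-bit, 'Z' unsigned 32-bit), or any ConstantFP for 'G'/'C'.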
15643  case 'I':
15644    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
15645      if (C->getZExtValue() <= 31)
15646        weight = CW_Constant;
15647    }
15648    break;
15649  case 'J':
15650    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
15651      if (C->getZExtValue() <= 63)
15652        weight = CW_Constant;
15653    }
15654    break;
15655  case 'K':
15656    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
15657      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
15658        weight = CW_Constant;
15659    }
15660    break;
15661  case 'L':
15662    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
15663      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
15664        weight = CW_Constant;
15665    }
15666    break;
15667  case 'M':
15668    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
15669      if (C->getZExtValue() <= 3)
15670        weight = CW_Constant;
15671    }
15672    break;
15673  case 'N':
15674    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
15675      if (C->getZExtValue() <= 0xff)
15676        weight = CW_Constant;
15677    }
15678    break;
15679  case 'G':
15680  case 'C':
15681    if (isa<ConstantFP>(CallOperandVal)) {
15682      weight = CW_Constant;
15683    }
15684    break;
15685  case 'e':
15686    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
15687      if ((C->getSExtValue() >= -0x80000000LL) &&
15688          (C->getSExtValue() <= 0x7fffffffLL))
15689        weight = CW_Constant;
15690    }
15691    break;
15692  case 'Z':
15693    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
15694      if (C->getZExtValue() <= 0xffffffff)
15695        weight = CW_Constant;
15696    }
15697    break;
15698  }
15699  return weight;
15700}
15701
15702/// LowerXConstraint - try to replace an X constraint, which matches anything,
15703/// with another that has more specific requirements based on the type of the
15704/// corresponding operand.
15705const char *X86TargetLowering::
15706LowerXConstraint(EVT ConstraintVT) const {
15707  // FP 'X' constraints get lowered to an SSE register class when available
15708  // ('Y' needs SSE2, 'x' needs SSE1); otherwise to 'f' like normal targets.
15709  if (ConstraintVT.isFloatingPoint()) {
15710    if (Subtarget->hasSSE2())
15711      return "Y";
15712    if (Subtarget->hasSSE1())
15713      return "x";
15714  }
15715
15716  return TargetLowering::LowerXConstraint(ConstraintVT);
15717}
15718
15719/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
15720/// vector.  If it is invalid, don't add anything to Ops.
15721void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15722                                                     std::string &Constraint,
15723                                                     std::vector<SDValue>&Ops,
15724                                                     SelectionDAG &DAG) const {
15725  SDValue Result(0, 0);
15726
15727  // Only support length 1 constraints for now.
15728  if (Constraint.length() > 1) return;
15729
15730  char ConstraintLetter = Constraint[0];
15731  switch (ConstraintLetter) {
15732  default: break;
15733  case 'I':
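    // Unsigned immediate in [0, 31] (e.g. a 32-bit shift count).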
15734    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15735      if (C->getZExtValue() <= 31) {
15736        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
15737        break;
15738      }
15739    }
15740    return;
15741  case 'J':
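    // Unsigned immediate in [0, 63] (e.g. a 64-bit shift count).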
15742    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15743      if (C->getZExtValue() <= 63) {
15744        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
15745        break;
15746      }
15747    }
15748    return;
15749  case 'K':
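    // Immediate that fits in a signed 8-bit field.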
15750    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15751      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
15752        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
15753        break;
15754      }
15755    }
15756    return;
15757  case 'N':
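    // Unsigned 8-bit immediate (e.g. an in/out port number).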
15758    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15759      if (C->getZExtValue() <= 255) {
15760        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
15761        break;
15762      }
15763    }
15764    return;
15765  case 'e': {
15766    // 32-bit signed value
15767    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15768      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
15769                                           C->getSExtValue())) {
15770        // Widen to 64 bits here to get it sign extended.
15771        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
15772        break;
15773      }
15774    }
15775    // FIXME gcc accepts some relocatable values here too, but only in certain
15776    // memory models; it's complicated.
15777    return;
15778  }
15779  case 'Z': {
15780    // 32-bit unsigned value
15781    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15782      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
15783                                           C->getZExtValue())) {
15784        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
15785        break;
15786      }
15787    }
15788    // FIXME gcc accepts some relocatable values here too, but only in certain
15789    // memory models; it's complicated.
15790    return;
15791  }
15792  case 'i': {
15793    // Literal immediates are always ok.
15794    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
15795      // Widen to 64 bits here to get it sign extended.
15796      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
15797      break;
15798    }
15799
15800    // In any sort of PIC mode, addresses need to be computed at runtime by
15801    // adding in a register or doing some sort of table lookup, so they
15802    // can't be used as immediates.
15803    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
15804      return;
15805
15806    // If we are in non-PIC codegen mode, we allow the address of a global
15807    // (with an optional displacement) to be used with 'i'.
15808    GlobalAddressSDNode *GA = 0;
15809    int64_t Offset = 0;
15810
15811    // Match either (GA), (GA+C), (GA+C1+C2), etc.
15812    while (1) {
15813      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
15814        Offset += GA->getOffset();
15815        break;
15816      } else if (Op.getOpcode() == ISD::ADD) {
15817        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
15818          Offset += C->getZExtValue();
15819          Op = Op.getOperand(0);
15820          continue;
15821        }
15822      } else if (Op.getOpcode() == ISD::SUB) {
15823        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
15824          Offset += -C->getZExtValue();
15825          Op = Op.getOperand(0);
15826          continue;
15827        }
15828      }
15829
15830      // Otherwise, this isn't something we can handle, reject it.
15831      return;
15832    }
15833
15834    const GlobalValue *GV = GA->getGlobal();
15835    // If we require an extra load to get this address, as in PIC mode, we
15836    // can't accept it.
15837    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
15838                                                        getTargetMachine())))
15839      return;
15840
15841    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
15842                                        GA->getValueType(0), Offset);
15843    break;
15844  }
15845  }
15846
15847  if (Result.getNode()) {
15848    Ops.push_back(Result);
15849    return;
15850  }
15851  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15852}
15853
15854std::pair<unsigned, const TargetRegisterClass*>
15855X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
15856                                                EVT VT) const {
15857  // First, see if this is a constraint that directly corresponds to an LLVM
15858  // register class.
15859  if (Constraint.size() == 1) {
15860    // GCC Constraint Letters
15861    switch (Constraint[0]) {
15862    default: break;
15863      // TODO: Slight differences here in allocation order and leaving
15864      // RIP in the class. Do they matter any more here than they do
15865      // in the normal allocation?
15866    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
15867      if (Subtarget->is64Bit()) {
15868        if (VT == MVT::i32 || VT == MVT::f32)
15869          return std::make_pair(0U, &X86::GR32RegClass);
15870        if (VT == MVT::i16)
15871          return std::make_pair(0U, &X86::GR16RegClass);
15872        if (VT == MVT::i8 || VT == MVT::i1)
15873          return std::make_pair(0U, &X86::GR8RegClass);
15874        if (VT == MVT::i64 || VT == MVT::f64)
15875          return std::make_pair(0U, &X86::GR64RegClass);
15876        break;
15877      }
15878      // 32-bit fallthrough
15879    case 'Q':   // Q_REGS
15880      if (VT == MVT::i32 || VT == MVT::f32)
15881        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
15882      if (VT == MVT::i16)
15883        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
15884      if (VT == MVT::i8 || VT == MVT::i1)
15885        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
15886      if (VT == MVT::i64)
15887        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
15888      break;
15889    case 'r':   // GENERAL_REGS
15890    case 'l':   // INDEX_REGS
15891      if (VT == MVT::i8 || VT == MVT::i1)
15892        return std::make_pair(0U, &X86::GR8RegClass);
15893      if (VT == MVT::i16)
15894        return std::make_pair(0U, &X86::GR16RegClass);
15895      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
15896        return std::make_pair(0U, &X86::GR32RegClass);
15897      return std::make_pair(0U, &X86::GR64RegClass);
15898    case 'R':   // LEGACY_REGS
15899      if (VT == MVT::i8 || VT == MVT::i1)
15900        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
15901      if (VT == MVT::i16)
15902        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
15903      if (VT == MVT::i32 || !Subtarget->is64Bit())
15904        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
15905      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
15906    case 'f':  // FP Stack registers.
15907      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
15908      // value to the correct fpstack register class.
15909      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
15910        return std::make_pair(0U, &X86::RFP32RegClass);
15911      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
15912        return std::make_pair(0U, &X86::RFP64RegClass);
15913      return std::make_pair(0U, &X86::RFP80RegClass);
15914    case 'y':   // MMX_REGS if MMX allowed.
15915      if (!Subtarget->hasMMX()) break;
15916      return std::make_pair(0U, &X86::VR64RegClass);
15917    case 'Y':   // SSE_REGS if SSE2 allowed
15918      if (!Subtarget->hasSSE2()) break;
15919      // FALL THROUGH.
15920    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
15921      if (!Subtarget->hasSSE1()) break;
15922
15923      switch (VT.getSimpleVT().SimpleTy) {
15924      default: break;
15925      // Scalar SSE types.
15926      case MVT::f32:
15927      case MVT::i32:
15928        return std::make_pair(0U, &X86::FR32RegClass);
15929      case MVT::f64:
15930      case MVT::i64:
15931        return std::make_pair(0U, &X86::FR64RegClass);
15932      // Vector types.
15933      case MVT::v16i8:
15934      case MVT::v8i16:
15935      case MVT::v4i32:
15936      case MVT::v2i64:
15937      case MVT::v4f32:
15938      case MVT::v2f64:
15939        return std::make_pair(0U, &X86::VR128RegClass);
15940      // AVX types.
15941      case MVT::v32i8:
15942      case MVT::v16i16:
15943      case MVT::v8i32:
15944      case MVT::v4i64:
15945      case MVT::v8f32:
15946      case MVT::v4f64:
15947        return std::make_pair(0U, &X86::VR256RegClass);
15948      }
15949      break;
15950    }
15951  }
15952
15953  // Use the default implementation in TargetLowering to convert the register
15954  // constraint into a member of a register class.
15955  std::pair<unsigned, const TargetRegisterClass*> Res;
15956  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
15957
15958  // Not found as a standard register?
15959  if (Res.second == 0) {
15960    // Map {st(0)} .. {st(7)} to the corresponding ST0 .. ST7 register.
15961    if (Constraint.size() == 7 && Constraint[0] == '{' &&
15962        tolower(Constraint[1]) == 's' &&
15963        tolower(Constraint[2]) == 't' &&
15964        Constraint[3] == '(' &&
15965        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
15966        Constraint[5] == ')' &&
15967        Constraint[6] == '}') {
15968
15969      Res.first = X86::ST0+Constraint[4]-'0';
15970      Res.second = &X86::RFP80RegClass;
15971      return Res;
15972    }
15973
15974    // GCC allows "st(0)" to be called just plain "st".
15975    if (StringRef("{st}").equals_lower(Constraint)) {
15976      Res.first = X86::ST0;
15977      Res.second = &X86::RFP80RegClass;
15978      return Res;
15979    }
15980
15981    // flags -> EFLAGS
15982    if (StringRef("{flags}").equals_lower(Constraint)) {
15983      Res.first = X86::EFLAGS;
15984      Res.second = &X86::CCRRegClass;
15985      return Res;
15986    }
15987
15988    // 'A' means EAX + EDX.
15989    if (Constraint == "A") {
15990      Res.first = X86::EAX;
15991      Res.second = &X86::GR32_ADRegClass;
15992      return Res;
15993    }
15994    return Res;
15995  }
15996
15997  // Otherwise, check to see if this is a register class of the wrong value
15998  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it
15999  // to turn into {ax},{dx}.
16000  if (Res.second->hasType(VT))
16001    return Res;   // Correct type already, nothing to do.
16002
16003  // All of the single-register GCC register classes map their values onto
16004  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
16005  // really want an 8-bit or 32-bit register, map to the appropriate register
16006  // class and return the appropriate register.
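  // For example, "{ax}" with an i8 operand becomes AL in GR8, with an i32
  // operand EAX in GR32, and with an i64 operand RAX in GR64.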
16007  if (Res.second == &X86::GR16RegClass) {
16008    if (VT == MVT::i8) {
16009      unsigned DestReg = 0;
16010      switch (Res.first) {
16011      default: break;
16012      case X86::AX: DestReg = X86::AL; break;
16013      case X86::DX: DestReg = X86::DL; break;
16014      case X86::CX: DestReg = X86::CL; break;
16015      case X86::BX: DestReg = X86::BL; break;
16016      }
16017      if (DestReg) {
16018        Res.first = DestReg;
16019        Res.second = &X86::GR8RegClass;
16020      }
16021    } else if (VT == MVT::i32) {
16022      unsigned DestReg = 0;
16023      switch (Res.first) {
16024      default: break;
16025      case X86::AX: DestReg = X86::EAX; break;
16026      case X86::DX: DestReg = X86::EDX; break;
16027      case X86::CX: DestReg = X86::ECX; break;
16028      case X86::BX: DestReg = X86::EBX; break;
16029      case X86::SI: DestReg = X86::ESI; break;
16030      case X86::DI: DestReg = X86::EDI; break;
16031      case X86::BP: DestReg = X86::EBP; break;
16032      case X86::SP: DestReg = X86::ESP; break;
16033      }
16034      if (DestReg) {
16035        Res.first = DestReg;
16036        Res.second = &X86::GR32RegClass;
16037      }
16038    } else if (VT == MVT::i64) {
16039      unsigned DestReg = 0;
16040      switch (Res.first) {
16041      default: break;
16042      case X86::AX: DestReg = X86::RAX; break;
16043      case X86::DX: DestReg = X86::RDX; break;
16044      case X86::CX: DestReg = X86::RCX; break;
16045      case X86::BX: DestReg = X86::RBX; break;
16046      case X86::SI: DestReg = X86::RSI; break;
16047      case X86::DI: DestReg = X86::RDI; break;
16048      case X86::BP: DestReg = X86::RBP; break;
16049      case X86::SP: DestReg = X86::RSP; break;
16050      }
16051      if (DestReg) {
16052        Res.first = DestReg;
16053        Res.second = &X86::GR64RegClass;
16054      }
16055    }
16056  } else if (Res.second == &X86::FR32RegClass ||
16057             Res.second == &X86::FR64RegClass ||
16058             Res.second == &X86::VR128RegClass) {
16059    // Handle references to XMM physical registers that got mapped into the
16060    // wrong class.  This can happen with constraints like {xmm0} where the
16061    // target independent register mapper will just pick the first match it can
16062    // find, ignoring the required type.
16063    if (VT == MVT::f32)
16064      Res.second = &X86::FR32RegClass;
16065    else if (VT == MVT::f64)
16066      Res.second = &X86::FR64RegClass;
16067    else if (X86::VR128RegClass.hasType(VT))
16068      Res.second = &X86::VR128RegClass;
16069  }
16070
16071  return Res;
16072}
16073