X86ISelLowering.cpp revision 71d1bf55a27017fceef25554e02021a3bc47cdb4
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#include "X86.h"
16#include "X86InstrBuilder.h"
17#include "X86ISelLowering.h"
18#include "X86MachineFunctionInfo.h"
19#include "X86TargetMachine.h"
20#include "llvm/CallingConv.h"
21#include "llvm/Constants.h"
22#include "llvm/DerivedTypes.h"
23#include "llvm/GlobalVariable.h"
24#include "llvm/Function.h"
25#include "llvm/Intrinsics.h"
26#include "llvm/ADT/BitVector.h"
27#include "llvm/ADT/VectorExtras.h"
28#include "llvm/CodeGen/CallingConvLower.h"
29#include "llvm/CodeGen/MachineFrameInfo.h"
30#include "llvm/CodeGen/MachineFunction.h"
31#include "llvm/CodeGen/MachineInstrBuilder.h"
32#include "llvm/CodeGen/MachineModuleInfo.h"
33#include "llvm/CodeGen/MachineRegisterInfo.h"
34#include "llvm/CodeGen/PseudoSourceValue.h"
35#include "llvm/CodeGen/SelectionDAG.h"
36#include "llvm/Support/MathExtras.h"
37#include "llvm/Support/Debug.h"
38#include "llvm/Target/TargetOptions.h"
39#include "llvm/ADT/SmallSet.h"
40#include "llvm/ADT/StringExtras.h"
41using namespace llvm;
42
43// Forward declarations.
44static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG);
45
46X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
47  : TargetLowering(TM) {
48  Subtarget = &TM.getSubtarget<X86Subtarget>();
49  X86ScalarSSEf64 = Subtarget->hasSSE2();
50  X86ScalarSSEf32 = Subtarget->hasSSE1();
51  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
52
53  bool Fast = false;
54
55  RegInfo = TM.getRegisterInfo();
56  TD = getTargetData();
57
58  // Set up the TargetLowering object.
59
60  // X86 is weird, it always uses i8 for shift amounts and setcc results.
61  setShiftAmountType(MVT::i8);
62  setSetCCResultContents(ZeroOrOneSetCCResult);
63  setSchedulingPreference(SchedulingForRegPressure);
64  setShiftAmountFlavor(Mask);   // shl X, 32 == shl X, 0
65  setStackPointerRegisterToSaveRestore(X86StackPtr);
66
67  if (Subtarget->isTargetDarwin()) {
68    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
69    setUseUnderscoreSetJmp(false);
70    setUseUnderscoreLongJmp(false);
71  } else if (Subtarget->isTargetMingw()) {
72    // MS runtime is weird: it exports _setjmp, but longjmp!
73    setUseUnderscoreSetJmp(true);
74    setUseUnderscoreLongJmp(false);
75  } else {
76    setUseUnderscoreSetJmp(true);
77    setUseUnderscoreLongJmp(true);
78  }
79
80  // Set up the register classes.
81  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
82  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
83  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
84  if (Subtarget->is64Bit())
85    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
86
87  setLoadXAction(ISD::SEXTLOAD, MVT::i1, Promote);
88
89  // We don't accept any truncstore of integer registers.
90  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
91  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
92  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
93  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
94  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
95  setTruncStoreAction(MVT::i16, MVT::i8, Expand);
96
97  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
98  // operation.
99  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
100  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
101  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
102
103  if (Subtarget->is64Bit()) {
104    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
105    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
106  } else {
107    if (X86ScalarSSEf64)
108      // If SSE i64 SINT_TO_FP is not available, expand i32 UINT_TO_FP.
109      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Expand);
110    else
111      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
112  }
113
114  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
115  // this operation.
116  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
117  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
118  // SSE has no i16 to fp conversion, only i32
119  if (X86ScalarSSEf32) {
120    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
121    // f32 and f64 cases are Legal, f80 case is not
122    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
123  } else {
124    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
125    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
126  }
127
128  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
129  // are Legal, f80 is custom lowered.
130  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
131  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
132
133  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
134  // this operation.
135  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
136  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
137
138  if (X86ScalarSSEf32) {
139    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
140    // f32 and f64 cases are Legal, f80 case is not
141    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
142  } else {
143    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
144    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
145  }
146
147  // Handle FP_TO_UINT by promoting the destination to a larger signed
148  // conversion.
149  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
150  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
151  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
152
153  if (Subtarget->is64Bit()) {
154    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
155    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
156  } else {
157    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
158      // Expand FP_TO_UINT into a select.
159      // FIXME: We would like to use a Custom expander here eventually to do
160      // the optimal thing for SSE vs. the default expansion in the legalizer.
161      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
162    else
163      // With SSE3 we can use fisttpll to convert to a signed i64.
164      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
165  }
166
167  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
168  if (!X86ScalarSSEf64) {
169    setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
170    setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
171  }
172
173  // Scalar integer divide and remainder are lowered to use operations that
174  // produce two results, to match the available instructions. This exposes
175  // the two-result form to trivial CSE, which is able to combine x/y and x%y
176  // into a single instruction.
177  //
178  // Scalar integer multiply-high is also lowered to use two-result
179  // operations, to match the available instructions. However, plain multiply
180  // (low) operations are left as Legal, as there are single-result
181  // instructions for this in x86. Using the two-result multiply instructions
182  // when both high and low results are needed must be arranged by dagcombine.
183  setOperationAction(ISD::MULHS           , MVT::i8    , Expand);
184  setOperationAction(ISD::MULHU           , MVT::i8    , Expand);
185  setOperationAction(ISD::SDIV            , MVT::i8    , Expand);
186  setOperationAction(ISD::UDIV            , MVT::i8    , Expand);
187  setOperationAction(ISD::SREM            , MVT::i8    , Expand);
188  setOperationAction(ISD::UREM            , MVT::i8    , Expand);
189  setOperationAction(ISD::MULHS           , MVT::i16   , Expand);
190  setOperationAction(ISD::MULHU           , MVT::i16   , Expand);
191  setOperationAction(ISD::SDIV            , MVT::i16   , Expand);
192  setOperationAction(ISD::UDIV            , MVT::i16   , Expand);
193  setOperationAction(ISD::SREM            , MVT::i16   , Expand);
194  setOperationAction(ISD::UREM            , MVT::i16   , Expand);
195  setOperationAction(ISD::MULHS           , MVT::i32   , Expand);
196  setOperationAction(ISD::MULHU           , MVT::i32   , Expand);
197  setOperationAction(ISD::SDIV            , MVT::i32   , Expand);
198  setOperationAction(ISD::UDIV            , MVT::i32   , Expand);
199  setOperationAction(ISD::SREM            , MVT::i32   , Expand);
200  setOperationAction(ISD::UREM            , MVT::i32   , Expand);
201  setOperationAction(ISD::MULHS           , MVT::i64   , Expand);
202  setOperationAction(ISD::MULHU           , MVT::i64   , Expand);
203  setOperationAction(ISD::SDIV            , MVT::i64   , Expand);
204  setOperationAction(ISD::UDIV            , MVT::i64   , Expand);
205  setOperationAction(ISD::SREM            , MVT::i64   , Expand);
206  setOperationAction(ISD::UREM            , MVT::i64   , Expand);
207
208  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
209  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
210  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
211  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
212  if (Subtarget->is64Bit())
213    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
214  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
215  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
216  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
217  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
218  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
219  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
220  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
221  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
222
223  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
224  setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
225  setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
226  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
227  setOperationAction(ISD::CTTZ             , MVT::i16  , Custom);
228  setOperationAction(ISD::CTLZ             , MVT::i16  , Custom);
229  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
230  setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
231  setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
232  if (Subtarget->is64Bit()) {
233    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
234    setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
235    setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
236  }
237
238  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
239  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
240
241  // These should be promoted to a larger select which is supported.
242  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
243  setOperationAction(ISD::SELECT           , MVT::i8   , Promote);
244  // X86 wants to expand cmov itself.
245  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
246  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
247  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
248  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
249  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
250  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
251  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
252  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
253  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
254  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
255  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
256  if (Subtarget->is64Bit()) {
257    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
258    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
259  }
260  // X86 ret instruction may pop stack.
261  setOperationAction(ISD::RET             , MVT::Other, Custom);
262  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
263
264  // Darwin ABI issue.
265  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
266  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
267  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
268  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
269  if (Subtarget->is64Bit())
270    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
271  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
272  if (Subtarget->is64Bit()) {
273    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
274    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
275    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
276    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
277  }
278  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
279  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
280  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
281  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
282  if (Subtarget->is64Bit()) {
283    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
284    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
285    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
286  }
287
288  if (Subtarget->hasSSE1())
289    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
290
291  if (!Subtarget->hasSSE2())
292    setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);
293
294  // Expand certain atomics
295  setOperationAction(ISD::ATOMIC_CMP_SWAP_8 , MVT::i8, Custom);
296  setOperationAction(ISD::ATOMIC_CMP_SWAP_16, MVT::i16, Custom);
297  setOperationAction(ISD::ATOMIC_CMP_SWAP_32, MVT::i32, Custom);
298  setOperationAction(ISD::ATOMIC_CMP_SWAP_64, MVT::i64, Custom);
299
300  setOperationAction(ISD::ATOMIC_LOAD_SUB_8 , MVT::i8, Custom);
301  setOperationAction(ISD::ATOMIC_LOAD_SUB_16, MVT::i16, Custom);
302  setOperationAction(ISD::ATOMIC_LOAD_SUB_32, MVT::i32, Custom);
303  setOperationAction(ISD::ATOMIC_LOAD_SUB_64, MVT::i64, Custom);
304
305  // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
306  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
307  // FIXME - use subtarget debug flags
308  if (!Subtarget->isTargetDarwin() &&
309      !Subtarget->isTargetELF() &&
310      !Subtarget->isTargetCygMing()) {
311    setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
312    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
313  }
314
315  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
316  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
317  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
318  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
319  if (Subtarget->is64Bit()) {
320    setExceptionPointerRegister(X86::RAX);
321    setExceptionSelectorRegister(X86::RDX);
322  } else {
323    setExceptionPointerRegister(X86::EAX);
324    setExceptionSelectorRegister(X86::EDX);
325  }
326  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
327  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
328
329  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
330
331  setOperationAction(ISD::TRAP, MVT::Other, Legal);
332
333  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
334  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
335  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
336  if (Subtarget->is64Bit()) {
337    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
338    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
339  } else {
340    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
341    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
342  }
343
344  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
345  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
346  if (Subtarget->is64Bit())
347    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
348  if (Subtarget->isTargetCygMing())
349    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
350  else
351    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
352
353  if (X86ScalarSSEf64) {
354    // f32 and f64 use SSE.
355    // Set up the FP register classes.
356    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
357    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
358
359    // Use ANDPD to simulate FABS.
360    setOperationAction(ISD::FABS , MVT::f64, Custom);
361    setOperationAction(ISD::FABS , MVT::f32, Custom);
362
363    // Use XORP to simulate FNEG.
364    setOperationAction(ISD::FNEG , MVT::f64, Custom);
365    setOperationAction(ISD::FNEG , MVT::f32, Custom);
366
367    // Use ANDPD and ORPD to simulate FCOPYSIGN.
368    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
369    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
370
371    // We don't support sin/cos/fmod
372    setOperationAction(ISD::FSIN , MVT::f64, Expand);
373    setOperationAction(ISD::FCOS , MVT::f64, Expand);
374    setOperationAction(ISD::FSIN , MVT::f32, Expand);
375    setOperationAction(ISD::FCOS , MVT::f32, Expand);
376
377    // Expand FP immediates into loads from the stack, except for the special
378    // cases we handle.
379    addLegalFPImmediate(APFloat(+0.0)); // xorpd
380    addLegalFPImmediate(APFloat(+0.0f)); // xorps
381
382    // Floating truncations from f80 and extensions to f80 go through memory.
383    // If optimizing, we lie about this though and handle it in
384    // InstructionSelectPreprocess so that dagcombine2 can hack on these.
385    if (Fast) {
386      setConvertAction(MVT::f32, MVT::f80, Expand);
387      setConvertAction(MVT::f64, MVT::f80, Expand);
388      setConvertAction(MVT::f80, MVT::f32, Expand);
389      setConvertAction(MVT::f80, MVT::f64, Expand);
390    }
391  } else if (X86ScalarSSEf32) {
392    // Use SSE for f32, x87 for f64.
393    // Set up the FP register classes.
394    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
395    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
396
397    // Use ANDPS to simulate FABS.
398    setOperationAction(ISD::FABS , MVT::f32, Custom);
399
400    // Use XORP to simulate FNEG.
401    setOperationAction(ISD::FNEG , MVT::f32, Custom);
402
403    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
404
405    // Use ANDPS and ORPS to simulate FCOPYSIGN.
406    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
407    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
408
409    // We don't support sin/cos/fmod
410    setOperationAction(ISD::FSIN , MVT::f32, Expand);
411    setOperationAction(ISD::FCOS , MVT::f32, Expand);
412
413    // Special cases we handle for FP constants.
414    addLegalFPImmediate(APFloat(+0.0f)); // xorps
415    addLegalFPImmediate(APFloat(+0.0)); // FLD0
416    addLegalFPImmediate(APFloat(+1.0)); // FLD1
417    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
418    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
419
420    // SSE <-> X87 conversions go through memory.  If optimizing, we lie about
421    // this though and handle it in InstructionSelectPreprocess so that
422    // dagcombine2 can hack on these.
423    if (Fast) {
424      setConvertAction(MVT::f32, MVT::f64, Expand);
425      setConvertAction(MVT::f32, MVT::f80, Expand);
426      setConvertAction(MVT::f80, MVT::f32, Expand);
427      setConvertAction(MVT::f64, MVT::f32, Expand);
428      // And x87->x87 truncations also.
429      setConvertAction(MVT::f80, MVT::f64, Expand);
430    }
431
432    if (!UnsafeFPMath) {
433      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
434      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
435    }
436  } else {
437    // f32 and f64 in x87.
438    // Set up the FP register classes.
439    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
440    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
441
442    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
443    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
444    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
445    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
446
447    // Floating truncations go through memory.  If optimizing, we lie about
448    // this though and handle it in InstructionSelectPreprocess so that
449    // dagcombine2 can hack on these.
450    if (Fast) {
451      setConvertAction(MVT::f80, MVT::f32, Expand);
452      setConvertAction(MVT::f64, MVT::f32, Expand);
453      setConvertAction(MVT::f80, MVT::f64, Expand);
454    }
455
456    if (!UnsafeFPMath) {
457      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
458      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
459    }
460    addLegalFPImmediate(APFloat(+0.0)); // FLD0
461    addLegalFPImmediate(APFloat(+1.0)); // FLD1
462    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
463    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
464    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
465    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
466    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
467    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
468  }
469
470  // Long double always uses X87.
471  addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
472  setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
473  setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
474  {
475    APFloat TmpFlt(+0.0);
476    TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven);
477    addLegalFPImmediate(TmpFlt);  // FLD0
478    TmpFlt.changeSign();
479    addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
480    APFloat TmpFlt2(+1.0);
481    TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven);
482    addLegalFPImmediate(TmpFlt2);  // FLD1
483    TmpFlt2.changeSign();
484    addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
485  }
486
487  if (!UnsafeFPMath) {
488    setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
489    setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
490  }
491
492  // Always use a library call for pow.
493  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
494  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
495  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
496
497  setOperationAction(ISD::FLOG, MVT::f80, Expand);
498  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
499  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
500  setOperationAction(ISD::FEXP, MVT::f80, Expand);
501  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
502
503  // First set operation action for all vector types to expand. Then we
504  // will selectively turn on ones that can be effectively codegen'd.
505  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
506       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
507    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
508    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
509    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
510    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
511    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
512    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
513    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
514    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
515    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
516    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
517    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
518    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
519    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
520    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
521    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
522    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
523    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
524    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
525    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
526    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
527    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
528    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
529    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
530    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
531    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
532    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
533    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
534    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
535    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
536    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
537    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
538    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
539    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
540    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
541    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
542    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
543    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
544    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
545    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
546    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
547    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
548    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
549    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
550  }
551
552  if (Subtarget->hasMMX()) {
553    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
554    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
555    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
556    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
557    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
558
559    // FIXME: add MMX packed arithmetics
560
561    setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
562    setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
563    setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
564    setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
565
566    setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
567    setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
568    setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
569    setOperationAction(ISD::SUB,                MVT::v1i64, Legal);
570
571    setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
572    setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
573
574    setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
575    AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
576    setOperationAction(ISD::AND,                MVT::v4i16, Promote);
577    AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
578    setOperationAction(ISD::AND,                MVT::v2i32, Promote);
579    AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
580    setOperationAction(ISD::AND,                MVT::v1i64, Legal);
581
582    setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
583    AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
584    setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
585    AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
586    setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
587    AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
588    setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
589
590    setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
591    AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
592    setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
593    AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
594    setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
595    AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
596    setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
597
598    setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
599    AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
600    setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
601    AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
602    setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
603    AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
604    setOperationAction(ISD::LOAD,               MVT::v2f32, Promote);
605    AddPromotedToType (ISD::LOAD,               MVT::v2f32, MVT::v1i64);
606    setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
607
608    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
609    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
610    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
611    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f32, Custom);
612    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
613
614    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
615    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
616    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
617    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
618
619    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2f32, Custom);
620    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
621    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
622    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
623
624    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
625  }
626
627  if (Subtarget->hasSSE1()) {
628    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
629
630    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
631    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
632    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
633    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
634    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
635    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
636    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
637    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
638    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
639    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
640    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
641    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
642  }
643
644  if (Subtarget->hasSSE2()) {
645    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
646    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
647    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
648    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
649    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
650
651    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
652    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
653    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
654    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
655    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
656    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
657    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
658    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
659    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
660    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
661    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
662    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
663    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
664    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
665    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
666
667    setOperationAction(ISD::VSETCC,             MVT::v2f64, Custom);
668    setOperationAction(ISD::VSETCC,             MVT::v16i8, Custom);
669    setOperationAction(ISD::VSETCC,             MVT::v8i16, Custom);
670    setOperationAction(ISD::VSETCC,             MVT::v4i32, Custom);
671
672    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
673    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
674    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
675    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
676    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
677
678    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
679    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
680      MVT VT = (MVT::SimpleValueType)i;
681      // Do not attempt to custom lower non-power-of-2 vectors
682      if (!isPowerOf2_32(VT.getVectorNumElements()))
683        continue;
684      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
685      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
686      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
687    }
688    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
689    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
690    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
691    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
692    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
693    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
694    if (Subtarget->is64Bit()) {
695      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
696      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
697    }
698
699    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
700    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
701      setOperationAction(ISD::AND,    (MVT::SimpleValueType)VT, Promote);
702      AddPromotedToType (ISD::AND,    (MVT::SimpleValueType)VT, MVT::v2i64);
703      setOperationAction(ISD::OR,     (MVT::SimpleValueType)VT, Promote);
704      AddPromotedToType (ISD::OR,     (MVT::SimpleValueType)VT, MVT::v2i64);
705      setOperationAction(ISD::XOR,    (MVT::SimpleValueType)VT, Promote);
706      AddPromotedToType (ISD::XOR,    (MVT::SimpleValueType)VT, MVT::v2i64);
707      setOperationAction(ISD::LOAD,   (MVT::SimpleValueType)VT, Promote);
708      AddPromotedToType (ISD::LOAD,   (MVT::SimpleValueType)VT, MVT::v2i64);
709      setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
710      AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v2i64);
711    }
712
713    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
714
715    // Custom lower v2i64 and v2f64 selects.
716    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
717    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
718    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
719    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
720
721  }
722
723  if (Subtarget->hasSSE41()) {
724    // FIXME: Do we need to handle scalar-to-vector here?
725    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
726    setOperationAction(ISD::MUL,                MVT::v2i64, Legal);
727
728    // i8 and i16 vectors are custom , because the source register and source
729    // source memory operand types are not the same width.  f32 vectors are
730    // custom since the immediate controlling the insert encodes additional
731    // information.
732    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
733    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
734    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Legal);
735    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
736
737    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
738    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
739    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
740    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
741
742    if (Subtarget->is64Bit()) {
743      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
744      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
745    }
746  }
747
748  if (Subtarget->hasSSE42()) {
749    setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
750  }
751
752  // We want to custom lower some of our intrinsics.
753  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
754
755  // We have target-specific dag combine patterns for the following nodes:
756  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
757  setTargetDAGCombine(ISD::BUILD_VECTOR);
758  setTargetDAGCombine(ISD::SELECT);
759  setTargetDAGCombine(ISD::STORE);
760
761  computeRegisterProperties();
762
763  // FIXME: These should be based on subtarget info. Plus, the values should
764  // be smaller when we are in optimizing for size mode.
765  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
766  maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
767  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
768  allowUnalignedMemoryAccesses = true; // x86 supports it!
769  setPrefLoopAlignment(16);
770}
771
772
773MVT X86TargetLowering::getSetCCResultType(const SDValue &) const {
774  return MVT::i8;
775}
776
777
778/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
779/// the desired ByVal argument alignment.
780static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
781  if (MaxAlign == 16)
782    return;
783  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
784    if (VTy->getBitWidth() == 128)
785      MaxAlign = 16;
786  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
787    unsigned EltAlign = 0;
788    getMaxByValAlign(ATy->getElementType(), EltAlign);
789    if (EltAlign > MaxAlign)
790      MaxAlign = EltAlign;
791  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
792    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
793      unsigned EltAlign = 0;
794      getMaxByValAlign(STy->getElementType(i), EltAlign);
795      if (EltAlign > MaxAlign)
796        MaxAlign = EltAlign;
797      if (MaxAlign == 16)
798        break;
799    }
800  }
801  return;
802}
803
804/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
805/// function arguments in the caller parameter area. For X86, aggregates
806/// that contain SSE vectors are placed at 16-byte boundaries while the rest
807/// are at 4-byte boundaries.
808unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
809  if (Subtarget->is64Bit()) {
810    // Max of 8 and alignment of type.
811    unsigned TyAlign = TD->getABITypeAlignment(Ty);
812    if (TyAlign > 8)
813      return TyAlign;
814    return 8;
815  }
816
817  unsigned Align = 4;
818  if (Subtarget->hasSSE1())
819    getMaxByValAlign(Ty, Align);
820  return Align;
821}
822
823/// getOptimalMemOpType - Returns the target specific optimal type for load
824/// and store operations as a result of memset, memcpy, and memmove
825/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
826/// determining it.
827MVT
828X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
829                                       bool isSrcConst, bool isSrcStr) const {
830  if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
831    return MVT::v4i32;
832  if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
833    return MVT::v4f32;
834  if (Subtarget->is64Bit() && Size >= 8)
835    return MVT::i64;
836  return MVT::i32;
837}
838
839
840/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
841/// jumptable.
842SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
843                                                      SelectionDAG &DAG) const {
844  if (usesGlobalOffsetTable())
845    return DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, getPointerTy());
846  if (!Subtarget->isPICStyleRIPRel())
847    return DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy());
848  return Table;
849}
850
851//===----------------------------------------------------------------------===//
852//               Return Value Calling Convention Implementation
853//===----------------------------------------------------------------------===//
854
855#include "X86GenCallingConv.inc"
856
857/// LowerRET - Lower an ISD::RET node.
858SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
859  assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");
860
861  SmallVector<CCValAssign, 16> RVLocs;
862  unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
863  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
864  CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
865  CCInfo.AnalyzeReturn(Op.getNode(), RetCC_X86);
866
867  // If this is the first return lowered for this function, add the regs to the
868  // liveout set for the function.
869  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
870    for (unsigned i = 0; i != RVLocs.size(); ++i)
871      if (RVLocs[i].isRegLoc())
872        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
873  }
874  SDValue Chain = Op.getOperand(0);
875
876  // Handle tail call return.
877  Chain = GetPossiblePreceedingTailCall(Chain, X86ISD::TAILCALL);
878  if (Chain.getOpcode() == X86ISD::TAILCALL) {
879    SDValue TailCall = Chain;
880    SDValue TargetAddress = TailCall.getOperand(1);
881    SDValue StackAdjustment = TailCall.getOperand(2);
882    assert(((TargetAddress.getOpcode() == ISD::Register &&
883               (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX ||
884                cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
885              TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
886              TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
887             "Expecting an global address, external symbol, or register");
888    assert(StackAdjustment.getOpcode() == ISD::Constant &&
889           "Expecting a const value");
890
891    SmallVector<SDValue,8> Operands;
892    Operands.push_back(Chain.getOperand(0));
893    Operands.push_back(TargetAddress);
894    Operands.push_back(StackAdjustment);
895    // Copy registers used by the call. Last operand is a flag so it is not
896    // copied.
897    for (unsigned i=3; i < TailCall.getNumOperands()-1; i++) {
898      Operands.push_back(Chain.getOperand(i));
899    }
900    return DAG.getNode(X86ISD::TC_RETURN, MVT::Other, &Operands[0],
901                       Operands.size());
902  }
903
904  // Regular return.
905  SDValue Flag;
906
907  SmallVector<SDValue, 6> RetOps;
908  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
909  // Operand #1 = Bytes To Pop
910  RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16));
911
912  // Copy the result values into the output registers.
913  for (unsigned i = 0; i != RVLocs.size(); ++i) {
914    CCValAssign &VA = RVLocs[i];
915    assert(VA.isRegLoc() && "Can only return in registers!");
916    SDValue ValToCopy = Op.getOperand(i*2+1);
917
918    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
919    // the RET instruction and handled by the FP Stackifier.
920    if (RVLocs[i].getLocReg() == X86::ST0 ||
921        RVLocs[i].getLocReg() == X86::ST1) {
922      // If this is a copy from an xmm register to ST(0), use an FPExtend to
923      // change the value to the FP stack register class.
924      if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT()))
925        ValToCopy = DAG.getNode(ISD::FP_EXTEND, MVT::f80, ValToCopy);
926      RetOps.push_back(ValToCopy);
927      // Don't emit a copytoreg.
928      continue;
929    }
930
931    Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), ValToCopy, Flag);
932    Flag = Chain.getValue(1);
933  }
934
935  // The x86-64 ABI for returning structs by value requires that we copy
936  // the sret argument into %rax for the return. We saved the argument into
937  // a virtual register in the entry block, so now we copy the value out
938  // and into %rax.
939  if (Subtarget->is64Bit() &&
940      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
941    MachineFunction &MF = DAG.getMachineFunction();
942    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
943    unsigned Reg = FuncInfo->getSRetReturnReg();
944    if (!Reg) {
945      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
946      FuncInfo->setSRetReturnReg(Reg);
947    }
948    SDValue Val = DAG.getCopyFromReg(Chain, Reg, getPointerTy());
949
950    Chain = DAG.getCopyToReg(Chain, X86::RAX, Val, Flag);
951    Flag = Chain.getValue(1);
952  }
953
954  RetOps[0] = Chain;  // Update chain.
955
956  // Add the flag if we have it.
957  if (Flag.getNode())
958    RetOps.push_back(Flag);
959
960  return DAG.getNode(X86ISD::RET_FLAG, MVT::Other, &RetOps[0], RetOps.size());
961}
962
963
964/// LowerCallResult - Lower the result values of an ISD::CALL into the
965/// appropriate copies out of appropriate physical registers.  This assumes that
966/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
967/// being lowered.  The returns a SDNode with the same number of values as the
968/// ISD::CALL.
969SDNode *X86TargetLowering::
970LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
971                unsigned CallingConv, SelectionDAG &DAG) {
972
973  // Assign locations to each value returned by this call.
974  SmallVector<CCValAssign, 16> RVLocs;
975  bool isVarArg = TheCall->isVarArg();
976  CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
977  CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);
978
979  SmallVector<SDValue, 8> ResultVals;
980
981  // Copy all of the result registers out of their specified physreg.
982  for (unsigned i = 0; i != RVLocs.size(); ++i) {
983    MVT CopyVT = RVLocs[i].getValVT();
984
985    // If this is a call to a function that returns an fp value on the floating
986    // point stack, but where we prefer to use the value in xmm registers, copy
987    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
988    if ((RVLocs[i].getLocReg() == X86::ST0 ||
989         RVLocs[i].getLocReg() == X86::ST1) &&
990        isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) {
991      CopyVT = MVT::f80;
992    }
993
994    Chain = DAG.getCopyFromReg(Chain, RVLocs[i].getLocReg(),
995                               CopyVT, InFlag).getValue(1);
996    SDValue Val = Chain.getValue(0);
997    InFlag = Chain.getValue(2);
998
999    if (CopyVT != RVLocs[i].getValVT()) {
1000      // Round the F80 the right size, which also moves to the appropriate xmm
1001      // register.
1002      Val = DAG.getNode(ISD::FP_ROUND, RVLocs[i].getValVT(), Val,
1003                        // This truncation won't change the value.
1004                        DAG.getIntPtrConstant(1));
1005    }
1006
1007    ResultVals.push_back(Val);
1008  }
1009
1010  // Merge everything together with a MERGE_VALUES node.
1011  ResultVals.push_back(Chain);
1012  return DAG.getMergeValues(TheCall->getVTList(), &ResultVals[0],
1013                            ResultVals.size()).getNode();
1014}
1015
1016
1017//===----------------------------------------------------------------------===//
1018//                C & StdCall & Fast Calling Convention implementation
1019//===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention just a little: the
//  callee cleans up the stack, not the caller. Symbols are also decorated
//  in a convention-specific way. It doesn't support any vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.
1026
1027/// AddLiveIn - This helper function adds the specified physical register to the
1028/// MachineFunction as a live in value.  It also creates a corresponding virtual
1029/// register for it.
1030static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
1031                          const TargetRegisterClass *RC) {
1032  assert(RC->contains(PReg) && "Not the correct regclass!");
1033  unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
1034  MF.getRegInfo().addLiveIn(PReg, VReg);
1035  return VReg;
1036}
1037
1038/// CallIsStructReturn - Determines whether a CALL node uses struct return
1039/// semantics.
1040static bool CallIsStructReturn(CallSDNode *TheCall) {
1041  unsigned NumOps = TheCall->getNumArgs();
1042  if (!NumOps)
1043    return false;
1044
1045  return TheCall->getArgFlags(0).isSRet();
1046}
1047
1048/// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses struct
1049/// return semantics.
1050static bool ArgsAreStructReturn(SDValue Op) {
1051  unsigned NumArgs = Op.getNode()->getNumValues() - 1;
1052  if (!NumArgs)
1053    return false;
1054
1055  return cast<ARG_FLAGSSDNode>(Op.getOperand(3))->getArgFlags().isSRet();
1056}
1057
1058/// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires
1059/// the callee to pop its own arguments. Callee pop is necessary to support tail
1060/// calls.
1061bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
1062  if (IsVarArg)
1063    return false;
1064
1065  switch (CallingConv) {
1066  default:
1067    return false;
1068  case CallingConv::X86_StdCall:
1069    return !Subtarget->is64Bit();
1070  case CallingConv::X86_FastCall:
1071    return !Subtarget->is64Bit();
1072  case CallingConv::Fast:
1073    return PerformTailCallOpt;
1074  }
1075}
1076
1077/// CCAssignFnForNode - Selects the correct CCAssignFn for a the
1078/// given CallingConvention value.
1079CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
1080  if (Subtarget->is64Bit()) {
1081    if (Subtarget->isTargetWin64())
1082      return CC_X86_Win64_C;
1083    else if (CC == CallingConv::Fast && PerformTailCallOpt)
1084      return CC_X86_64_TailCall;
1085    else
1086      return CC_X86_64_C;
1087  }
1088
1089  if (CC == CallingConv::X86_FastCall)
1090    return CC_X86_32_FastCall;
1091  else if (CC == CallingConv::Fast)
1092    return CC_X86_32_FastCC;
1093  else
1094    return CC_X86_32_C;
1095}
1096
1097/// NameDecorationForFORMAL_ARGUMENTS - Selects the appropriate decoration to
1098/// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node.
1099NameDecorationStyle
1100X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) {
1101  unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1102  if (CC == CallingConv::X86_FastCall)
1103    return FastCall;
1104  else if (CC == CallingConv::X86_StdCall)
1105    return StdCall;
1106  return None;
1107}
1108
1109
1110/// CallRequiresGOTInRegister - Check whether the call requires the GOT pointer
1111/// in a register before calling.
1112bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) {
1113  return !IsTailCall && !Is64Bit &&
1114    getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1115    Subtarget->isPICStyleGOT();
1116}
1117
1118/// CallRequiresFnAddressInReg - Check whether the call requires the function
1119/// address to be loaded in a register.
1120bool
1121X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) {
1122  return !Is64Bit && IsTailCall &&
1123    getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1124    Subtarget->isPICStyleGOT();
1125}
1126
1127/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1128/// by "Src" to address "Dst" with size and alignment information specified by
1129/// the specific parameter attribute. The copy will be passed as a byval
1130/// function parameter.
1131static SDValue
1132CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1133                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG) {
1134  SDValue SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1135  return DAG.getMemcpy(Chain, Dst, Src, SizeNode, Flags.getByValAlign(),
1136                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
1137}
1138
1139SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG,
1140                                              const CCValAssign &VA,
1141                                              MachineFrameInfo *MFI,
1142                                              unsigned CC,
1143                                              SDValue Root, unsigned i) {
1144  // Create the nodes corresponding to a load from this parameter slot.
1145  ISD::ArgFlagsTy Flags =
1146    cast<ARG_FLAGSSDNode>(Op.getOperand(3 + i))->getArgFlags();
1147  bool AlwaysUseMutable = (CC==CallingConv::Fast) && PerformTailCallOpt;
1148  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1149
1150  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1151  // changed with more analysis.
1152  // In case of tail call optimization mark all arguments mutable. Since they
1153  // could be overwritten by lowering of arguments in case of a tail call.
1154  int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
1155                                  VA.getLocMemOffset(), isImmutable);
1156  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1157  if (Flags.isByVal())
1158    return FIN;
1159  return DAG.getLoad(VA.getValVT(), Root, FIN,
1160                     PseudoSourceValue::getFixedStack(FI), 0);
1161}
1162
1163SDValue
1164X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
1165  MachineFunction &MF = DAG.getMachineFunction();
1166  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1167
1168  const Function* Fn = MF.getFunction();
1169  if (Fn->hasExternalLinkage() &&
1170      Subtarget->isTargetCygMing() &&
1171      Fn->getName() == "main")
1172    FuncInfo->setForceFramePointer(true);
1173
1174  // Decorate the function name.
1175  FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op));
1176
1177  MachineFrameInfo *MFI = MF.getFrameInfo();
1178  SDValue Root = Op.getOperand(0);
1179  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
1180  unsigned CC = MF.getFunction()->getCallingConv();
1181  bool Is64Bit = Subtarget->is64Bit();
1182  bool IsWin64 = Subtarget->isTargetWin64();
1183
1184  assert(!(isVarArg && CC == CallingConv::Fast) &&
1185         "Var args not supported with calling convention fastcc");
1186
1187  // Assign locations to all of the incoming arguments.
1188  SmallVector<CCValAssign, 16> ArgLocs;
1189  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
1190  CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC));
1191
1192  SmallVector<SDValue, 8> ArgValues;
1193  unsigned LastVal = ~0U;
1194  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1195    CCValAssign &VA = ArgLocs[i];
1196    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1197    // places.
1198    assert(VA.getValNo() != LastVal &&
1199           "Don't support value assigned to multiple locs yet");
1200    LastVal = VA.getValNo();
1201
1202    if (VA.isRegLoc()) {
1203      MVT RegVT = VA.getLocVT();
1204      TargetRegisterClass *RC;
1205      if (RegVT == MVT::i32)
1206        RC = X86::GR32RegisterClass;
1207      else if (Is64Bit && RegVT == MVT::i64)
1208        RC = X86::GR64RegisterClass;
1209      else if (RegVT == MVT::f32)
1210        RC = X86::FR32RegisterClass;
1211      else if (RegVT == MVT::f64)
1212        RC = X86::FR64RegisterClass;
1213      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
1214        RC = X86::VR128RegisterClass;
1215      else if (RegVT.isVector()) {
1216        assert(RegVT.getSizeInBits() == 64);
1217        if (!Is64Bit)
1218          RC = X86::VR64RegisterClass;     // MMX values are passed in MMXs.
1219        else {
1220          // Darwin calling convention passes MMX values in either GPRs or
1221          // XMMs in x86-64. Other targets pass them in memory.
1222          if (RegVT != MVT::v1i64 && Subtarget->hasSSE2()) {
1223            RC = X86::VR128RegisterClass;  // MMX values are passed in XMMs.
1224            RegVT = MVT::v2i64;
1225          } else {
1226            RC = X86::GR64RegisterClass;   // v1i64 values are passed in GPRs.
1227            RegVT = MVT::i64;
1228          }
1229        }
1230      } else {
1231        assert(0 && "Unknown argument type!");
1232      }
1233
1234      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
1235      SDValue ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
1236
1237      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1238      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1239      // right size.
1240      if (VA.getLocInfo() == CCValAssign::SExt)
1241        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
1242                               DAG.getValueType(VA.getValVT()));
1243      else if (VA.getLocInfo() == CCValAssign::ZExt)
1244        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
1245                               DAG.getValueType(VA.getValVT()));
1246
1247      if (VA.getLocInfo() != CCValAssign::Full)
1248        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
1249
1250      // Handle MMX values passed in GPRs.
1251      if (Is64Bit && RegVT != VA.getLocVT()) {
1252        if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass)
1253          ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
1254        else if (RC == X86::VR128RegisterClass) {
1255          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i64, ArgValue,
1256                                 DAG.getConstant(0, MVT::i64));
1257          ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
1258        }
1259      }
1260
1261      ArgValues.push_back(ArgValue);
1262    } else {
1263      assert(VA.isMemLoc());
1264      ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, CC, Root, i));
1265    }
1266  }
1267
1268  // The x86-64 ABI for returning structs by value requires that we copy
1269  // the sret argument into %rax for the return. Save the argument into
1270  // a virtual register so that we can access it from the return points.
1271  if (Is64Bit && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1272    MachineFunction &MF = DAG.getMachineFunction();
1273    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1274    unsigned Reg = FuncInfo->getSRetReturnReg();
1275    if (!Reg) {
1276      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1277      FuncInfo->setSRetReturnReg(Reg);
1278    }
1279    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), Reg, ArgValues[0]);
1280    Root = DAG.getNode(ISD::TokenFactor, MVT::Other, Copy, Root);
1281  }
1282
1283  unsigned StackSize = CCInfo.getNextStackOffset();
1284  // align stack specially for tail calls
1285  if (PerformTailCallOpt && CC == CallingConv::Fast)
1286    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1287
1288  // If the function takes variable number of arguments, make a frame index for
1289  // the start of the first vararg value... for expansion of llvm.va_start.
1290  if (isVarArg) {
1291    if (Is64Bit || CC != CallingConv::X86_FastCall) {
1292      VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
1293    }
1294    if (Is64Bit) {
1295      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1296
1297      // FIXME: We should really autogenerate these arrays
1298      static const unsigned GPR64ArgRegsWin64[] = {
1299        X86::RCX, X86::RDX, X86::R8,  X86::R9
1300      };
1301      static const unsigned XMMArgRegsWin64[] = {
1302        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1303      };
1304      static const unsigned GPR64ArgRegs64Bit[] = {
1305        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1306      };
1307      static const unsigned XMMArgRegs64Bit[] = {
1308        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1309        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1310      };
1311      const unsigned *GPR64ArgRegs, *XMMArgRegs;
1312
1313      if (IsWin64) {
1314        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1315        GPR64ArgRegs = GPR64ArgRegsWin64;
1316        XMMArgRegs = XMMArgRegsWin64;
1317      } else {
1318        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1319        GPR64ArgRegs = GPR64ArgRegs64Bit;
1320        XMMArgRegs = XMMArgRegs64Bit;
1321      }
1322      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1323                                                       TotalNumIntRegs);
1324      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1325                                                       TotalNumXMMRegs);
1326
1327      // For X86-64, if there are vararg parameters that are passed via
1328      // registers, then we must store them to their spots on the stack so they
1329      // may be loaded by deferencing the result of va_next.
1330      VarArgsGPOffset = NumIntRegs * 8;
1331      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
1332      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
1333                                                 TotalNumXMMRegs * 16, 16);
1334
1335      // Store the integer parameter registers.
1336      SmallVector<SDValue, 8> MemOps;
1337      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
1338      SDValue FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
1339                                  DAG.getIntPtrConstant(VarArgsGPOffset));
1340      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1341        unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs],
1342                                  X86::GR64RegisterClass);
1343        SDValue Val = DAG.getCopyFromReg(Root, VReg, MVT::i64);
1344        SDValue Store =
1345          DAG.getStore(Val.getValue(1), Val, FIN,
1346                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
1347        MemOps.push_back(Store);
1348        FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
1349                          DAG.getIntPtrConstant(8));
1350      }
1351
1352      // Now store the XMM (fp + vector) parameter registers.
1353      FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
1354                        DAG.getIntPtrConstant(VarArgsFPOffset));
1355      for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1356        unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs],
1357                                  X86::VR128RegisterClass);
1358        SDValue Val = DAG.getCopyFromReg(Root, VReg, MVT::v4f32);
1359        SDValue Store =
1360          DAG.getStore(Val.getValue(1), Val, FIN,
1361                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
1362        MemOps.push_back(Store);
1363        FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
1364                          DAG.getIntPtrConstant(16));
1365      }
1366      if (!MemOps.empty())
1367          Root = DAG.getNode(ISD::TokenFactor, MVT::Other,
1368                             &MemOps[0], MemOps.size());
1369    }
1370  }
1371
1372  ArgValues.push_back(Root);
1373
1374  // Some CCs need callee pop.
1375  if (IsCalleePop(isVarArg, CC)) {
1376    BytesToPopOnReturn  = StackSize; // Callee pops everything.
1377    BytesCallerReserves = 0;
1378  } else {
1379    BytesToPopOnReturn  = 0; // Callee pops nothing.
1380    // If this is an sret function, the return should pop the hidden pointer.
1381    if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op))
1382      BytesToPopOnReturn = 4;
1383    BytesCallerReserves = StackSize;
1384  }
1385
1386  if (!Is64Bit) {
1387    RegSaveFrameIndex = 0xAAAAAAA;   // RegSaveFrameIndex is X86-64 only.
1388    if (CC == CallingConv::X86_FastCall)
1389      VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
1390  }
1391
1392  FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
1393
1394  // Return the new list of results.
1395  return DAG.getMergeValues(Op.getNode()->getVTList(), &ArgValues[0],
1396                            ArgValues.size()).getValue(Op.getResNo());
1397}
1398
1399SDValue
1400X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
1401                                    const SDValue &StackPtr,
1402                                    const CCValAssign &VA,
1403                                    SDValue Chain,
1404                                    SDValue Arg, ISD::ArgFlagsTy Flags) {
1405  unsigned LocMemOffset = VA.getLocMemOffset();
1406  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1407  PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
1408  if (Flags.isByVal()) {
1409    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG);
1410  }
1411  return DAG.getStore(Chain, Arg, PtrOff,
1412                      PseudoSourceValue::getStack(), LocMemOffset);
1413}
1414
1415/// EmitTailCallLoadRetAddr - Emit a load of return adress if tail call
1416/// optimization is performed and it is required.
1417SDValue
1418X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1419                                           SDValue &OutRetAddr,
1420                                           SDValue Chain,
1421                                           bool IsTailCall,
1422                                           bool Is64Bit,
1423                                           int FPDiff) {
1424  if (!IsTailCall || FPDiff==0) return Chain;
1425
1426  // Adjust the Return address stack slot.
1427  MVT VT = getPointerTy();
1428  OutRetAddr = getReturnAddressFrameIndex(DAG);
1429  // Load the "old" Return address.
1430  OutRetAddr = DAG.getLoad(VT, Chain,OutRetAddr, NULL, 0);
1431  return SDValue(OutRetAddr.getNode(), 1);
1432}
1433
1434/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call
1435/// optimization is performed and it is required (FPDiff!=0).
1436static SDValue
1437EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1438                         SDValue Chain, SDValue RetAddrFrIdx,
1439                         bool Is64Bit, int FPDiff) {
1440  // Store the return address to the appropriate stack slot.
1441  if (!FPDiff) return Chain;
1442  // Calculate the new stack slot for the return address.
1443  int SlotSize = Is64Bit ? 8 : 4;
1444  int NewReturnAddrFI =
1445    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
1446  MVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1447  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1448  Chain = DAG.getStore(Chain, RetAddrFrIdx, NewRetAddrFrIdx,
1449                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
1450  return Chain;
1451}
1452
1453SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
1454  MachineFunction &MF = DAG.getMachineFunction();
1455  CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
1456  SDValue Chain       = TheCall->getChain();
1457  unsigned CC         = TheCall->getCallingConv();
1458  bool isVarArg       = TheCall->isVarArg();
1459  bool IsTailCall     = TheCall->isTailCall() &&
1460                        CC == CallingConv::Fast && PerformTailCallOpt;
1461  SDValue Callee      = TheCall->getCallee();
1462  bool Is64Bit        = Subtarget->is64Bit();
1463  bool IsStructRet    = CallIsStructReturn(TheCall);
1464
1465  assert(!(isVarArg && CC == CallingConv::Fast) &&
1466         "Var args not supported with calling convention fastcc");
1467
1468  // Analyze operands of the call, assigning locations to each operand.
1469  SmallVector<CCValAssign, 16> ArgLocs;
1470  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
1471  CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC));
1472
1473  // Get a count of how many bytes are to be pushed on the stack.
1474  unsigned NumBytes = CCInfo.getNextStackOffset();
1475  if (PerformTailCallOpt && CC == CallingConv::Fast)
1476    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1477
1478  int FPDiff = 0;
1479  if (IsTailCall) {
1480    // Lower arguments at fp - stackoffset + fpdiff.
1481    unsigned NumBytesCallerPushed =
1482      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1483    FPDiff = NumBytesCallerPushed - NumBytes;
1484
1485    // Set the delta of movement of the returnaddr stackslot.
1486    // But only set if delta is greater than previous delta.
1487    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1488      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1489  }
1490
1491  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes));
1492
1493  SDValue RetAddrFrIdx;
1494  // Load return adress for tail calls.
1495  Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, IsTailCall, Is64Bit,
1496                                  FPDiff);
1497
1498  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1499  SmallVector<SDValue, 8> MemOpChains;
1500  SDValue StackPtr;
1501
1502  // Walk the register/memloc assignments, inserting copies/loads.  In the case
1503  // of tail call optimization arguments are handle later.
1504  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1505    CCValAssign &VA = ArgLocs[i];
1506    SDValue Arg = TheCall->getArg(i);
1507    ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
1508    bool isByVal = Flags.isByVal();
1509
1510    // Promote the value if needed.
1511    switch (VA.getLocInfo()) {
1512    default: assert(0 && "Unknown loc info!");
1513    case CCValAssign::Full: break;
1514    case CCValAssign::SExt:
1515      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
1516      break;
1517    case CCValAssign::ZExt:
1518      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
1519      break;
1520    case CCValAssign::AExt:
1521      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
1522      break;
1523    }
1524
1525    if (VA.isRegLoc()) {
1526      if (Is64Bit) {
1527        MVT RegVT = VA.getLocVT();
1528        if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
1529          switch (VA.getLocReg()) {
1530          default:
1531            break;
1532          case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX:
1533          case X86::R8: {
1534            // Special case: passing MMX values in GPR registers.
1535            Arg = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Arg);
1536            break;
1537          }
1538          case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
1539          case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: {
1540            // Special case: passing MMX values in XMM registers.
1541            Arg = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Arg);
1542            Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Arg);
1543            Arg = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v2i64,
1544                              DAG.getNode(ISD::UNDEF, MVT::v2i64), Arg,
1545                              getMOVLMask(2, DAG));
1546            break;
1547          }
1548          }
1549      }
1550      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1551    } else {
1552      if (!IsTailCall || (IsTailCall && isByVal)) {
1553        assert(VA.isMemLoc());
1554        if (StackPtr.getNode() == 0)
1555          StackPtr = DAG.getCopyFromReg(Chain, X86StackPtr, getPointerTy());
1556
1557        MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
1558                                               Chain, Arg, Flags));
1559      }
1560    }
1561  }
1562
1563  if (!MemOpChains.empty())
1564    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
1565                        &MemOpChains[0], MemOpChains.size());
1566
1567  // Build a sequence of copy-to-reg nodes chained together with token chain
1568  // and flag operands which copy the outgoing args into registers.
1569  SDValue InFlag;
1570  // Tail call byval lowering might overwrite argument registers so in case of
1571  // tail call optimization the copies to registers are lowered later.
1572  if (!IsTailCall)
1573    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1574      Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
1575                               InFlag);
1576      InFlag = Chain.getValue(1);
1577    }
1578
1579  // ELF / PIC requires GOT in the EBX register before function calls via PLT
1580  // GOT pointer.
1581  if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) {
1582    Chain = DAG.getCopyToReg(Chain, X86::EBX,
1583                             DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
1584                             InFlag);
1585    InFlag = Chain.getValue(1);
1586  }
1587  // If we are tail calling and generating PIC/GOT style code load the address
1588  // of the callee into ecx. The value in ecx is used as target of the tail
1589  // jump. This is done to circumvent the ebx/callee-saved problem for tail
1590  // calls on PIC/GOT architectures. Normally we would just put the address of
1591  // GOT into ebx and then call target@PLT. But for tail callss ebx would be
1592  // restored (since ebx is callee saved) before jumping to the target@PLT.
1593  if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
1594    // Note: The actual moving to ecx is done further down.
1595    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1596    if (G && !G->getGlobal()->hasHiddenVisibility() &&
1597        !G->getGlobal()->hasProtectedVisibility())
1598      Callee =  LowerGlobalAddress(Callee, DAG);
1599    else if (isa<ExternalSymbolSDNode>(Callee))
1600      Callee = LowerExternalSymbol(Callee,DAG);
1601  }
1602
1603  if (Is64Bit && isVarArg) {
1604    // From AMD64 ABI document:
1605    // For calls that may call functions that use varargs or stdargs
1606    // (prototype-less calls or calls to functions containing ellipsis (...) in
1607    // the declaration) %al is used as hidden argument to specify the number
1608    // of SSE registers used. The contents of %al do not need to match exactly
1609    // the number of registers, but must be an ubound on the number of SSE
1610    // registers used and is in the range 0 - 8 inclusive.
1611
1612    // FIXME: Verify this on Win64
1613    // Count the number of XMM registers allocated.
1614    static const unsigned XMMArgRegs[] = {
1615      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1616      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1617    };
1618    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1619
1620    Chain = DAG.getCopyToReg(Chain, X86::AL,
1621                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1622    InFlag = Chain.getValue(1);
1623  }
1624
1625
1626  // For tail calls lower the arguments to the 'real' stack slot.
1627  if (IsTailCall) {
1628    SmallVector<SDValue, 8> MemOpChains2;
1629    SDValue FIN;
1630    int FI = 0;
1631    // Do not flag preceeding copytoreg stuff together with the following stuff.
1632    InFlag = SDValue();
1633    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1634      CCValAssign &VA = ArgLocs[i];
1635      if (!VA.isRegLoc()) {
1636        assert(VA.isMemLoc());
1637        SDValue Arg = TheCall->getArg(i);
1638        ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
1639        // Create frame index.
1640        int32_t Offset = VA.getLocMemOffset()+FPDiff;
1641        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
1642        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
1643        FIN = DAG.getFrameIndex(FI, getPointerTy());
1644
1645        if (Flags.isByVal()) {
1646          // Copy relative to framepointer.
1647          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
1648          if (StackPtr.getNode() == 0)
1649            StackPtr = DAG.getCopyFromReg(Chain, X86StackPtr, getPointerTy());
1650          Source = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, Source);
1651
1652          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain,
1653                                                           Flags, DAG));
1654        } else {
1655          // Store relative to framepointer.
1656          MemOpChains2.push_back(
1657            DAG.getStore(Chain, Arg, FIN,
1658                         PseudoSourceValue::getFixedStack(FI), 0));
1659        }
1660      }
1661    }
1662
1663    if (!MemOpChains2.empty())
1664      Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
1665                          &MemOpChains2[0], MemOpChains2.size());
1666
1667    // Copy arguments to their registers.
1668    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1669      Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
1670                               InFlag);
1671      InFlag = Chain.getValue(1);
1672    }
1673    InFlag =SDValue();
1674
1675    // Store the return address to the appropriate stack slot.
1676    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
1677                                     FPDiff);
1678  }
1679
1680  // If the callee is a GlobalAddress node (quite common, every direct call is)
1681  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
1682  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1683    // We should use extra load for direct calls to dllimported functions in
1684    // non-JIT mode.
1685    if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
1686                                        getTargetMachine(), true))
1687      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
1688  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1689    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
1690  } else if (IsTailCall) {
1691    unsigned Opc = Is64Bit ? X86::R9 : X86::EAX;
1692
1693    Chain = DAG.getCopyToReg(Chain,
1694                             DAG.getRegister(Opc, getPointerTy()),
1695                             Callee,InFlag);
1696    Callee = DAG.getRegister(Opc, getPointerTy());
1697    // Add register as live out.
1698    DAG.getMachineFunction().getRegInfo().addLiveOut(Opc);
1699  }
1700
1701  // Returns a chain & a flag for retval copy to use.
1702  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
1703  SmallVector<SDValue, 8> Ops;
1704
1705  if (IsTailCall) {
1706    Ops.push_back(Chain);
1707    Ops.push_back(DAG.getIntPtrConstant(NumBytes));
1708    Ops.push_back(DAG.getIntPtrConstant(0));
1709    if (InFlag.getNode())
1710      Ops.push_back(InFlag);
1711    Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
1712    InFlag = Chain.getValue(1);
1713
1714    // Returns a chain & a flag for retval copy to use.
1715    NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
1716    Ops.clear();
1717  }
1718
1719  Ops.push_back(Chain);
1720  Ops.push_back(Callee);
1721
1722  if (IsTailCall)
1723    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
1724
1725  // Add argument registers to the end of the list so that they are known live
1726  // into the call.
1727  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1728    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1729                                  RegsToPass[i].second.getValueType()));
1730
1731  // Add an implicit use GOT pointer in EBX.
1732  if (!IsTailCall && !Is64Bit &&
1733      getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1734      Subtarget->isPICStyleGOT())
1735    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
1736
1737  // Add an implicit use of AL for x86 vararg functions.
1738  if (Is64Bit && isVarArg)
1739    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
1740
1741  if (InFlag.getNode())
1742    Ops.push_back(InFlag);
1743
1744  if (IsTailCall) {
1745    assert(InFlag.getNode() &&
1746           "Flag must be set. Depend on flag being set in LowerRET");
1747    Chain = DAG.getNode(X86ISD::TAILCALL,
1748                        TheCall->getVTList(), &Ops[0], Ops.size());
1749
1750    return SDValue(Chain.getNode(), Op.getResNo());
1751  }
1752
1753  Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size());
1754  InFlag = Chain.getValue(1);
1755
1756  // Create the CALLSEQ_END node.
1757  unsigned NumBytesForCalleeToPush;
1758  if (IsCalleePop(isVarArg, CC))
1759    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
1760  else if (!Is64Bit && CC != CallingConv::Fast && IsStructRet)
1761    // If this is is a call to a struct-return function, the callee
1762    // pops the hidden struct pointer, so we have to push it back.
1763    // This is common for Darwin/X86, Linux & Mingw32 targets.
1764    NumBytesForCalleeToPush = 4;
1765  else
1766    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
1767
1768  // Returns a flag for retval copy to use.
1769  Chain = DAG.getCALLSEQ_END(Chain,
1770                             DAG.getIntPtrConstant(NumBytes),
1771                             DAG.getIntPtrConstant(NumBytesForCalleeToPush),
1772                             InFlag);
1773  InFlag = Chain.getValue(1);
1774
1775  // Handle result values, copying them out of physregs into vregs that we
1776  // return.
1777  return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
1778                 Op.getResNo());
1779}
1780
1781
1782//===----------------------------------------------------------------------===//
1783//                Fast Calling Convention (tail call) implementation
1784//===----------------------------------------------------------------------===//
1785
//  Like stdcall, the callee cleans up the arguments, except that ECX is
//  reserved for storing the tail called function address. Only 2 registers are
//  free for argument passing (inreg). Tail call optimization is performed
//  provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
//  for example.)
//  If a tail called callee has more arguments than the caller, the caller
//  needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after the
//  original RETADDR, but before the saved framepointer or the spilled registers
//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
1802//  stack layout:
1803//    arg1
1804//    arg2
1805//    RETADDR
1806//    [ new RETADDR
1807//      move area ]
1808//    (possible EBP)
1809//    ESI
1810//    EDI
1811//    local1 ..
1812
1813/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
1814/// for a 16 byte align requirement.
1815unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
1816                                                        SelectionDAG& DAG) {
1817  MachineFunction &MF = DAG.getMachineFunction();
1818  const TargetMachine &TM = MF.getTarget();
1819  const TargetFrameInfo &TFI = *TM.getFrameInfo();
1820  unsigned StackAlignment = TFI.getStackAlignment();
1821  uint64_t AlignMask = StackAlignment - 1;
1822  int64_t Offset = StackSize;
1823  uint64_t SlotSize = TD->getPointerSize();
1824  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
1825    // Number smaller than 12 so just add the difference.
1826    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
1827  } else {
1828    // Mask out lower bits, add stackalignment once plus the 12 bytes.
1829    Offset = ((~AlignMask) & Offset) + StackAlignment +
1830      (StackAlignment-SlotSize);
1831  }
1832  return Offset;
1833}
1834
1835/// IsEligibleForTailCallElimination - Check to see whether the next instruction
1836/// following the call is a return. A function is eligible if caller/callee
1837/// calling conventions match, currently only fastcc supports tail calls, and
1838/// the function CALL is immediatly followed by a RET.
1839bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
1840                                                      SDValue Ret,
1841                                                      SelectionDAG& DAG) const {
1842  if (!PerformTailCallOpt)
1843    return false;
1844
1845  if (CheckTailCallReturnConstraints(TheCall, Ret)) {
1846    MachineFunction &MF = DAG.getMachineFunction();
1847    unsigned CallerCC = MF.getFunction()->getCallingConv();
1848    unsigned CalleeCC= TheCall->getCallingConv();
1849    if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
1850      SDValue Callee = TheCall->getCallee();
1851      // On x86/32Bit PIC/GOT  tail calls are supported.
1852      if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
1853          !Subtarget->isPICStyleGOT()|| !Subtarget->is64Bit())
1854        return true;
1855
1856      // Can only do local tail calls (in same module, hidden or protected) on
1857      // x86_64 PIC/GOT at the moment.
1858      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1859        return G->getGlobal()->hasHiddenVisibility()
1860            || G->getGlobal()->hasProtectedVisibility();
1861    }
1862  }
1863
1864  return false;
1865}
1866
1867FastISel *
1868X86TargetLowering::createFastISel(MachineFunction &mf,
1869                                  MachineModuleInfo *mmo,
1870                                  DenseMap<const Value *, unsigned> &vm,
1871                                  DenseMap<const BasicBlock *,
1872                                           MachineBasicBlock *> &bm,
1873                                  DenseMap<const AllocaInst *, int> &am) {
1874
1875  return X86::createFastISel(mf, mmo, vm, bm, am);
1876}
1877
1878
1879//===----------------------------------------------------------------------===//
1880//                           Other Lowering Hooks
1881//===----------------------------------------------------------------------===//
1882
1883
1884SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
1885  MachineFunction &MF = DAG.getMachineFunction();
1886  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1887  int ReturnAddrIndex = FuncInfo->getRAIndex();
1888  uint64_t SlotSize = TD->getPointerSize();
1889
1890  if (ReturnAddrIndex == 0) {
1891    // Set up a frame object for the return address.
1892    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize);
1893    FuncInfo->setRAIndex(ReturnAddrIndex);
1894  }
1895
1896  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
1897}
1898
1899
1900/// translateX86CC - do a one to one translation of a ISD::CondCode to the X86
1901/// specific condition code. It returns a false if it cannot do a direct
1902/// translation. X86CC is the translated CondCode.  LHS/RHS are modified as
1903/// needed.
1904static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
1905                           unsigned &X86CC, SDValue &LHS, SDValue &RHS,
1906                           SelectionDAG &DAG) {
1907  X86CC = X86::COND_INVALID;
1908  if (!isFP) {
1909    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
1910      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
1911        // X > -1   -> X == 0, jump !sign.
1912        RHS = DAG.getConstant(0, RHS.getValueType());
1913        X86CC = X86::COND_NS;
1914        return true;
1915      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
1916        // X < 0   -> X == 0, jump on sign.
1917        X86CC = X86::COND_S;
1918        return true;
1919      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
1920        // X < 1   -> X <= 0
1921        RHS = DAG.getConstant(0, RHS.getValueType());
1922        X86CC = X86::COND_LE;
1923        return true;
1924      }
1925    }
1926
1927    switch (SetCCOpcode) {
1928    default: break;
1929    case ISD::SETEQ:  X86CC = X86::COND_E;  break;
1930    case ISD::SETGT:  X86CC = X86::COND_G;  break;
1931    case ISD::SETGE:  X86CC = X86::COND_GE; break;
1932    case ISD::SETLT:  X86CC = X86::COND_L;  break;
1933    case ISD::SETLE:  X86CC = X86::COND_LE; break;
1934    case ISD::SETNE:  X86CC = X86::COND_NE; break;
1935    case ISD::SETULT: X86CC = X86::COND_B;  break;
1936    case ISD::SETUGT: X86CC = X86::COND_A;  break;
1937    case ISD::SETULE: X86CC = X86::COND_BE; break;
1938    case ISD::SETUGE: X86CC = X86::COND_AE; break;
1939    }
1940  } else {
1941    // First determine if it requires or is profitable to flip the operands.
1942    bool Flip = false;
1943    switch (SetCCOpcode) {
1944    default: break;
1945    case ISD::SETOLT:
1946    case ISD::SETOLE:
1947    case ISD::SETUGT:
1948    case ISD::SETUGE:
1949      Flip = true;
1950      break;
1951    }
1952
1953    // If LHS is a foldable load, but RHS is not, flip the condition.
1954    if (!Flip &&
1955        (ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
1956        !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
1957      SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
1958      Flip = true;
1959    }
1960    if (Flip)
1961      std::swap(LHS, RHS);
1962
1963    // On a floating point condition, the flags are set as follows:
1964    // ZF  PF  CF   op
1965    //  0 | 0 | 0 | X > Y
1966    //  0 | 0 | 1 | X < Y
1967    //  1 | 0 | 0 | X == Y
1968    //  1 | 1 | 1 | unordered
1969    switch (SetCCOpcode) {
1970    default: break;
1971    case ISD::SETUEQ:
1972    case ISD::SETEQ:
1973      X86CC = X86::COND_E;
1974      break;
1975    case ISD::SETOLT:              // flipped
1976    case ISD::SETOGT:
1977    case ISD::SETGT:
1978      X86CC = X86::COND_A;
1979      break;
1980    case ISD::SETOLE:              // flipped
1981    case ISD::SETOGE:
1982    case ISD::SETGE:
1983      X86CC = X86::COND_AE;
1984      break;
1985    case ISD::SETUGT:              // flipped
1986    case ISD::SETULT:
1987    case ISD::SETLT:
1988      X86CC = X86::COND_B;
1989      break;
1990    case ISD::SETUGE:              // flipped
1991    case ISD::SETULE:
1992    case ISD::SETLE:
1993      X86CC = X86::COND_BE;
1994      break;
1995    case ISD::SETONE:
1996    case ISD::SETNE:
1997      X86CC = X86::COND_NE;
1998      break;
1999    case ISD::SETUO:
2000      X86CC = X86::COND_P;
2001      break;
2002    case ISD::SETO:
2003      X86CC = X86::COND_NP;
2004      break;
2005    }
2006  }
2007
2008  return X86CC != X86::COND_INVALID;
2009}
2010
2011/// hasFPCMov - is there a floating point cmov for the specific X86 condition
2012/// code. Current x86 isa includes the following FP cmov instructions:
2013/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2014static bool hasFPCMov(unsigned X86CC) {
2015  switch (X86CC) {
2016  default:
2017    return false;
2018  case X86::COND_B:
2019  case X86::COND_BE:
2020  case X86::COND_E:
2021  case X86::COND_P:
2022  case X86::COND_A:
2023  case X86::COND_AE:
2024  case X86::COND_NE:
2025  case X86::COND_NP:
2026    return true;
2027  }
2028}
2029
2030/// isUndefOrInRange - Op is either an undef node or a ConstantSDNode.  Return
2031/// true if Op is undef or if its value falls within the specified range (L, H].
2032static bool isUndefOrInRange(SDValue Op, unsigned Low, unsigned Hi) {
2033  if (Op.getOpcode() == ISD::UNDEF)
2034    return true;
2035
2036  unsigned Val = cast<ConstantSDNode>(Op)->getZExtValue();
2037  return (Val >= Low && Val < Hi);
2038}
2039
2040/// isUndefOrEqual - Op is either an undef node or a ConstantSDNode.  Return
2041/// true if Op is undef or if its value equal to the specified value.
2042static bool isUndefOrEqual(SDValue Op, unsigned Val) {
2043  if (Op.getOpcode() == ISD::UNDEF)
2044    return true;
2045  return cast<ConstantSDNode>(Op)->getZExtValue() == Val;
2046}
2047
2048/// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
2049/// specifies a shuffle of elements that is suitable for input to PSHUFD.
2050bool X86::isPSHUFDMask(SDNode *N) {
2051  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2052
2053  if (N->getNumOperands() != 2 && N->getNumOperands() != 4)
2054    return false;
2055
2056  // Check if the value doesn't reference the second vector.
2057  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
2058    SDValue Arg = N->getOperand(i);
2059    if (Arg.getOpcode() == ISD::UNDEF) continue;
2060    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2061    if (cast<ConstantSDNode>(Arg)->getZExtValue() >= e)
2062      return false;
2063  }
2064
2065  return true;
2066}
2067
2068/// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
2069/// specifies a shuffle of elements that is suitable for input to PSHUFHW.
2070bool X86::isPSHUFHWMask(SDNode *N) {
2071  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2072
2073  if (N->getNumOperands() != 8)
2074    return false;
2075
2076  // Lower quadword copied in order.
2077  for (unsigned i = 0; i != 4; ++i) {
2078    SDValue Arg = N->getOperand(i);
2079    if (Arg.getOpcode() == ISD::UNDEF) continue;
2080    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2081    if (cast<ConstantSDNode>(Arg)->getZExtValue() != i)
2082      return false;
2083  }
2084
2085  // Upper quadword shuffled.
2086  for (unsigned i = 4; i != 8; ++i) {
2087    SDValue Arg = N->getOperand(i);
2088    if (Arg.getOpcode() == ISD::UNDEF) continue;
2089    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2090    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2091    if (Val < 4 || Val > 7)
2092      return false;
2093  }
2094
2095  return true;
2096}
2097
2098/// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
2099/// specifies a shuffle of elements that is suitable for input to PSHUFLW.
2100bool X86::isPSHUFLWMask(SDNode *N) {
2101  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2102
2103  if (N->getNumOperands() != 8)
2104    return false;
2105
2106  // Upper quadword copied in order.
2107  for (unsigned i = 4; i != 8; ++i)
2108    if (!isUndefOrEqual(N->getOperand(i), i))
2109      return false;
2110
2111  // Lower quadword shuffled.
2112  for (unsigned i = 0; i != 4; ++i)
2113    if (!isUndefOrInRange(N->getOperand(i), 0, 4))
2114      return false;
2115
2116  return true;
2117}
2118
2119/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2120/// specifies a shuffle of elements that is suitable for input to SHUFP*.
2121static bool isSHUFPMask(SDOperandPtr Elems, unsigned NumElems) {
2122  if (NumElems != 2 && NumElems != 4) return false;
2123
2124  unsigned Half = NumElems / 2;
2125  for (unsigned i = 0; i < Half; ++i)
2126    if (!isUndefOrInRange(Elems[i], 0, NumElems))
2127      return false;
2128  for (unsigned i = Half; i < NumElems; ++i)
2129    if (!isUndefOrInRange(Elems[i], NumElems, NumElems*2))
2130      return false;
2131
2132  return true;
2133}
2134
2135bool X86::isSHUFPMask(SDNode *N) {
2136  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2137  return ::isSHUFPMask(N->op_begin(), N->getNumOperands());
2138}
2139
2140/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
2141/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
2142/// half elements to come from vector 1 (which would equal the dest.) and
2143/// the upper half to come from vector 2.
2144static bool isCommutedSHUFP(SDOperandPtr Ops, unsigned NumOps) {
2145  if (NumOps != 2 && NumOps != 4) return false;
2146
2147  unsigned Half = NumOps / 2;
2148  for (unsigned i = 0; i < Half; ++i)
2149    if (!isUndefOrInRange(Ops[i], NumOps, NumOps*2))
2150      return false;
2151  for (unsigned i = Half; i < NumOps; ++i)
2152    if (!isUndefOrInRange(Ops[i], 0, NumOps))
2153      return false;
2154  return true;
2155}
2156
2157static bool isCommutedSHUFP(SDNode *N) {
2158  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2159  return isCommutedSHUFP(N->op_begin(), N->getNumOperands());
2160}
2161
2162/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
2163/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
2164bool X86::isMOVHLPSMask(SDNode *N) {
2165  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2166
2167  if (N->getNumOperands() != 4)
2168    return false;
2169
2170  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
2171  return isUndefOrEqual(N->getOperand(0), 6) &&
2172         isUndefOrEqual(N->getOperand(1), 7) &&
2173         isUndefOrEqual(N->getOperand(2), 2) &&
2174         isUndefOrEqual(N->getOperand(3), 3);
2175}
2176
2177/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
2178/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
2179/// <2, 3, 2, 3>
2180bool X86::isMOVHLPS_v_undef_Mask(SDNode *N) {
2181  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2182
2183  if (N->getNumOperands() != 4)
2184    return false;
2185
2186  // Expect bit0 == 2, bit1 == 3, bit2 == 2, bit3 == 3
2187  return isUndefOrEqual(N->getOperand(0), 2) &&
2188         isUndefOrEqual(N->getOperand(1), 3) &&
2189         isUndefOrEqual(N->getOperand(2), 2) &&
2190         isUndefOrEqual(N->getOperand(3), 3);
2191}
2192
2193/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
2194/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
2195bool X86::isMOVLPMask(SDNode *N) {
2196  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2197
2198  unsigned NumElems = N->getNumOperands();
2199  if (NumElems != 2 && NumElems != 4)
2200    return false;
2201
2202  for (unsigned i = 0; i < NumElems/2; ++i)
2203    if (!isUndefOrEqual(N->getOperand(i), i + NumElems))
2204      return false;
2205
2206  for (unsigned i = NumElems/2; i < NumElems; ++i)
2207    if (!isUndefOrEqual(N->getOperand(i), i))
2208      return false;
2209
2210  return true;
2211}
2212
2213/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
2214/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
2215/// and MOVLHPS.
2216bool X86::isMOVHPMask(SDNode *N) {
2217  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2218
2219  unsigned NumElems = N->getNumOperands();
2220  if (NumElems != 2 && NumElems != 4)
2221    return false;
2222
2223  for (unsigned i = 0; i < NumElems/2; ++i)
2224    if (!isUndefOrEqual(N->getOperand(i), i))
2225      return false;
2226
2227  for (unsigned i = 0; i < NumElems/2; ++i) {
2228    SDValue Arg = N->getOperand(i + NumElems/2);
2229    if (!isUndefOrEqual(Arg, i + NumElems))
2230      return false;
2231  }
2232
2233  return true;
2234}
2235
2236/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
2237/// specifies a shuffle of elements that is suitable for input to UNPCKL.
2238bool static isUNPCKLMask(SDOperandPtr Elts, unsigned NumElts,
2239                         bool V2IsSplat = false) {
2240  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2241    return false;
2242
2243  for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
2244    SDValue BitI  = Elts[i];
2245    SDValue BitI1 = Elts[i+1];
2246    if (!isUndefOrEqual(BitI, j))
2247      return false;
2248    if (V2IsSplat) {
2249      if (isUndefOrEqual(BitI1, NumElts))
2250        return false;
2251    } else {
2252      if (!isUndefOrEqual(BitI1, j + NumElts))
2253        return false;
2254    }
2255  }
2256
2257  return true;
2258}
2259
2260bool X86::isUNPCKLMask(SDNode *N, bool V2IsSplat) {
2261  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2262  return ::isUNPCKLMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
2263}
2264
2265/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
2266/// specifies a shuffle of elements that is suitable for input to UNPCKH.
2267bool static isUNPCKHMask(SDOperandPtr Elts, unsigned NumElts,
2268                         bool V2IsSplat = false) {
2269  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2270    return false;
2271
2272  for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
2273    SDValue BitI  = Elts[i];
2274    SDValue BitI1 = Elts[i+1];
2275    if (!isUndefOrEqual(BitI, j + NumElts/2))
2276      return false;
2277    if (V2IsSplat) {
2278      if (isUndefOrEqual(BitI1, NumElts))
2279        return false;
2280    } else {
2281      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
2282        return false;
2283    }
2284  }
2285
2286  return true;
2287}
2288
2289bool X86::isUNPCKHMask(SDNode *N, bool V2IsSplat) {
2290  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2291  return ::isUNPCKHMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
2292}
2293
2294/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
2295/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
2296/// <0, 0, 1, 1>
2297bool X86::isUNPCKL_v_undef_Mask(SDNode *N) {
2298  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2299
2300  unsigned NumElems = N->getNumOperands();
2301  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2302    return false;
2303
2304  for (unsigned i = 0, j = 0; i != NumElems; i += 2, ++j) {
2305    SDValue BitI  = N->getOperand(i);
2306    SDValue BitI1 = N->getOperand(i+1);
2307
2308    if (!isUndefOrEqual(BitI, j))
2309      return false;
2310    if (!isUndefOrEqual(BitI1, j))
2311      return false;
2312  }
2313
2314  return true;
2315}
2316
2317/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
2318/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
2319/// <2, 2, 3, 3>
2320bool X86::isUNPCKH_v_undef_Mask(SDNode *N) {
2321  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2322
2323  unsigned NumElems = N->getNumOperands();
2324  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2325    return false;
2326
2327  for (unsigned i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
2328    SDValue BitI  = N->getOperand(i);
2329    SDValue BitI1 = N->getOperand(i + 1);
2330
2331    if (!isUndefOrEqual(BitI, j))
2332      return false;
2333    if (!isUndefOrEqual(BitI1, j))
2334      return false;
2335  }
2336
2337  return true;
2338}
2339
2340/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
2341/// specifies a shuffle of elements that is suitable for input to MOVSS,
2342/// MOVSD, and MOVD, i.e. setting the lowest element.
2343static bool isMOVLMask(SDOperandPtr Elts, unsigned NumElts) {
2344  if (NumElts != 2 && NumElts != 4)
2345    return false;
2346
2347  if (!isUndefOrEqual(Elts[0], NumElts))
2348    return false;
2349
2350  for (unsigned i = 1; i < NumElts; ++i) {
2351    if (!isUndefOrEqual(Elts[i], i))
2352      return false;
2353  }
2354
2355  return true;
2356}
2357
2358bool X86::isMOVLMask(SDNode *N) {
2359  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2360  return ::isMOVLMask(N->op_begin(), N->getNumOperands());
2361}
2362
2363/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
2364/// of what x86 movss want. X86 movs requires the lowest  element to be lowest
2365/// element of vector 2 and the other elements to come from vector 1 in order.
2366static bool isCommutedMOVL(SDOperandPtr Ops, unsigned NumOps,
2367                           bool V2IsSplat = false,
2368                           bool V2IsUndef = false) {
2369  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
2370    return false;
2371
2372  if (!isUndefOrEqual(Ops[0], 0))
2373    return false;
2374
2375  for (unsigned i = 1; i < NumOps; ++i) {
2376    SDValue Arg = Ops[i];
2377    if (!(isUndefOrEqual(Arg, i+NumOps) ||
2378          (V2IsUndef && isUndefOrInRange(Arg, NumOps, NumOps*2)) ||
2379          (V2IsSplat && isUndefOrEqual(Arg, NumOps))))
2380      return false;
2381  }
2382
2383  return true;
2384}
2385
2386static bool isCommutedMOVL(SDNode *N, bool V2IsSplat = false,
2387                           bool V2IsUndef = false) {
2388  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2389  return isCommutedMOVL(N->op_begin(), N->getNumOperands(),
2390                        V2IsSplat, V2IsUndef);
2391}
2392
2393/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2394/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
2395bool X86::isMOVSHDUPMask(SDNode *N) {
2396  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2397
2398  if (N->getNumOperands() != 4)
2399    return false;
2400
2401  // Expect 1, 1, 3, 3
2402  for (unsigned i = 0; i < 2; ++i) {
2403    SDValue Arg = N->getOperand(i);
2404    if (Arg.getOpcode() == ISD::UNDEF) continue;
2405    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2406    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2407    if (Val != 1) return false;
2408  }
2409
2410  bool HasHi = false;
2411  for (unsigned i = 2; i < 4; ++i) {
2412    SDValue Arg = N->getOperand(i);
2413    if (Arg.getOpcode() == ISD::UNDEF) continue;
2414    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2415    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2416    if (Val != 3) return false;
2417    HasHi = true;
2418  }
2419
2420  // Don't use movshdup if it can be done with a shufps.
2421  return HasHi;
2422}
2423
2424/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2425/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
2426bool X86::isMOVSLDUPMask(SDNode *N) {
2427  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2428
2429  if (N->getNumOperands() != 4)
2430    return false;
2431
2432  // Expect 0, 0, 2, 2
2433  for (unsigned i = 0; i < 2; ++i) {
2434    SDValue Arg = N->getOperand(i);
2435    if (Arg.getOpcode() == ISD::UNDEF) continue;
2436    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2437    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2438    if (Val != 0) return false;
2439  }
2440
2441  bool HasHi = false;
2442  for (unsigned i = 2; i < 4; ++i) {
2443    SDValue Arg = N->getOperand(i);
2444    if (Arg.getOpcode() == ISD::UNDEF) continue;
2445    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2446    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2447    if (Val != 2) return false;
2448    HasHi = true;
2449  }
2450
2451  // Don't use movshdup if it can be done with a shufps.
2452  return HasHi;
2453}
2454
2455/// isIdentityMask - Return true if the specified VECTOR_SHUFFLE operand
2456/// specifies a identity operation on the LHS or RHS.
2457static bool isIdentityMask(SDNode *N, bool RHS = false) {
2458  unsigned NumElems = N->getNumOperands();
2459  for (unsigned i = 0; i < NumElems; ++i)
2460    if (!isUndefOrEqual(N->getOperand(i), i + (RHS ? NumElems : 0)))
2461      return false;
2462  return true;
2463}
2464
2465/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
2466/// a splat of a single element.
2467static bool isSplatMask(SDNode *N) {
2468  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2469
2470  // This is a splat operation if each element of the permute is the same, and
2471  // if the value doesn't reference the second vector.
2472  unsigned NumElems = N->getNumOperands();
2473  SDValue ElementBase;
2474  unsigned i = 0;
2475  for (; i != NumElems; ++i) {
2476    SDValue Elt = N->getOperand(i);
2477    if (isa<ConstantSDNode>(Elt)) {
2478      ElementBase = Elt;
2479      break;
2480    }
2481  }
2482
2483  if (!ElementBase.getNode())
2484    return false;
2485
2486  for (; i != NumElems; ++i) {
2487    SDValue Arg = N->getOperand(i);
2488    if (Arg.getOpcode() == ISD::UNDEF) continue;
2489    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2490    if (Arg != ElementBase) return false;
2491  }
2492
2493  // Make sure it is a splat of the first vector operand.
2494  return cast<ConstantSDNode>(ElementBase)->getZExtValue() < NumElems;
2495}
2496
2497/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
2498/// a splat of a single element and it's a 2 or 4 element mask.
2499bool X86::isSplatMask(SDNode *N) {
2500  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2501
2502  // We can only splat 64-bit, and 32-bit quantities with a single instruction.
2503  if (N->getNumOperands() != 4 && N->getNumOperands() != 2)
2504    return false;
2505  return ::isSplatMask(N);
2506}
2507
2508/// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand
2509/// specifies a splat of zero element.
2510bool X86::isSplatLoMask(SDNode *N) {
2511  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2512
2513  for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
2514    if (!isUndefOrEqual(N->getOperand(i), 0))
2515      return false;
2516  return true;
2517}
2518
2519/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2520/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
2521bool X86::isMOVDDUPMask(SDNode *N) {
2522  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2523
2524  unsigned e = N->getNumOperands() / 2;
2525  for (unsigned i = 0; i < e; ++i)
2526    if (!isUndefOrEqual(N->getOperand(i), i))
2527      return false;
2528  for (unsigned i = 0; i < e; ++i)
2529    if (!isUndefOrEqual(N->getOperand(e+i), i))
2530      return false;
2531  return true;
2532}
2533
2534/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
2535/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
2536/// instructions.
2537unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
2538  unsigned NumOperands = N->getNumOperands();
2539  unsigned Shift = (NumOperands == 4) ? 2 : 1;
2540  unsigned Mask = 0;
2541  for (unsigned i = 0; i < NumOperands; ++i) {
2542    unsigned Val = 0;
2543    SDValue Arg = N->getOperand(NumOperands-i-1);
2544    if (Arg.getOpcode() != ISD::UNDEF)
2545      Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2546    if (Val >= NumOperands) Val -= NumOperands;
2547    Mask |= Val;
2548    if (i != NumOperands - 1)
2549      Mask <<= Shift;
2550  }
2551
2552  return Mask;
2553}
2554
2555/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
2556/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
2557/// instructions.
2558unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
2559  unsigned Mask = 0;
2560  // 8 nodes, but we only care about the last 4.
2561  for (unsigned i = 7; i >= 4; --i) {
2562    unsigned Val = 0;
2563    SDValue Arg = N->getOperand(i);
2564    if (Arg.getOpcode() != ISD::UNDEF)
2565      Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2566    Mask |= (Val - 4);
2567    if (i != 4)
2568      Mask <<= 2;
2569  }
2570
2571  return Mask;
2572}
2573
2574/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
2575/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
2576/// instructions.
2577unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
2578  unsigned Mask = 0;
2579  // 8 nodes, but we only care about the first 4.
2580  for (int i = 3; i >= 0; --i) {
2581    unsigned Val = 0;
2582    SDValue Arg = N->getOperand(i);
2583    if (Arg.getOpcode() != ISD::UNDEF)
2584      Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2585    Mask |= Val;
2586    if (i != 0)
2587      Mask <<= 2;
2588  }
2589
2590  return Mask;
2591}
2592
2593/// isPSHUFHW_PSHUFLWMask - true if the specified VECTOR_SHUFFLE operand
2594/// specifies a 8 element shuffle that can be broken into a pair of
2595/// PSHUFHW and PSHUFLW.
2596static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
2597  assert(N->getOpcode() == ISD::BUILD_VECTOR);
2598
2599  if (N->getNumOperands() != 8)
2600    return false;
2601
2602  // Lower quadword shuffled.
2603  for (unsigned i = 0; i != 4; ++i) {
2604    SDValue Arg = N->getOperand(i);
2605    if (Arg.getOpcode() == ISD::UNDEF) continue;
2606    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2607    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2608    if (Val >= 4)
2609      return false;
2610  }
2611
2612  // Upper quadword shuffled.
2613  for (unsigned i = 4; i != 8; ++i) {
2614    SDValue Arg = N->getOperand(i);
2615    if (Arg.getOpcode() == ISD::UNDEF) continue;
2616    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2617    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2618    if (Val < 4 || Val > 7)
2619      return false;
2620  }
2621
2622  return true;
2623}
2624
2625/// CommuteVectorShuffle - Swap vector_shuffle operands as well as
2626/// values in ther permute mask.
2627static SDValue CommuteVectorShuffle(SDValue Op, SDValue &V1,
2628                                      SDValue &V2, SDValue &Mask,
2629                                      SelectionDAG &DAG) {
2630  MVT VT = Op.getValueType();
2631  MVT MaskVT = Mask.getValueType();
2632  MVT EltVT = MaskVT.getVectorElementType();
2633  unsigned NumElems = Mask.getNumOperands();
2634  SmallVector<SDValue, 8> MaskVec;
2635
2636  for (unsigned i = 0; i != NumElems; ++i) {
2637    SDValue Arg = Mask.getOperand(i);
2638    if (Arg.getOpcode() == ISD::UNDEF) {
2639      MaskVec.push_back(DAG.getNode(ISD::UNDEF, EltVT));
2640      continue;
2641    }
2642    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2643    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2644    if (Val < NumElems)
2645      MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT));
2646    else
2647      MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT));
2648  }
2649
2650  std::swap(V1, V2);
2651  Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], NumElems);
2652  return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
2653}
2654
2655/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
2656/// the two vector operands have swapped position.
2657static
2658SDValue CommuteVectorShuffleMask(SDValue Mask, SelectionDAG &DAG) {
2659  MVT MaskVT = Mask.getValueType();
2660  MVT EltVT = MaskVT.getVectorElementType();
2661  unsigned NumElems = Mask.getNumOperands();
2662  SmallVector<SDValue, 8> MaskVec;
2663  for (unsigned i = 0; i != NumElems; ++i) {
2664    SDValue Arg = Mask.getOperand(i);
2665    if (Arg.getOpcode() == ISD::UNDEF) {
2666      MaskVec.push_back(DAG.getNode(ISD::UNDEF, EltVT));
2667      continue;
2668    }
2669    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2670    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2671    if (Val < NumElems)
2672      MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT));
2673    else
2674      MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT));
2675  }
2676  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], NumElems);
2677}
2678
2679
2680/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
2681/// match movhlps. The lower half elements should come from upper half of
2682/// V1 (and in order), and the upper half elements should come from the upper
2683/// half of V2 (and in order).
2684static bool ShouldXformToMOVHLPS(SDNode *Mask) {
2685  unsigned NumElems = Mask->getNumOperands();
2686  if (NumElems != 4)
2687    return false;
2688  for (unsigned i = 0, e = 2; i != e; ++i)
2689    if (!isUndefOrEqual(Mask->getOperand(i), i+2))
2690      return false;
2691  for (unsigned i = 2; i != 4; ++i)
2692    if (!isUndefOrEqual(Mask->getOperand(i), i+4))
2693      return false;
2694  return true;
2695}
2696
2697/// isScalarLoadToVector - Returns true if the node is a scalar load that
2698/// is promoted to a vector. It also returns the LoadSDNode by reference if
2699/// required.
2700static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
2701  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
2702    return false;
2703  N = N->getOperand(0).getNode();
2704  if (!ISD::isNON_EXTLoad(N))
2705    return false;
2706  if (LD)
2707    *LD = cast<LoadSDNode>(N);
2708  return true;
2709}
2710
2711/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
2712/// match movlp{s|d}. The lower half elements should come from lower half of
2713/// V1 (and in order), and the upper half elements should come from the upper
2714/// half of V2 (and in order). And since V1 will become the source of the
2715/// MOVLP, it must be either a vector load or a scalar load to vector.
2716static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, SDNode *Mask) {
2717  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
2718    return false;
2719  // Is V2 is a vector load, don't do this transformation. We will try to use
2720  // load folding shufps op.
2721  if (ISD::isNON_EXTLoad(V2))
2722    return false;
2723
2724  unsigned NumElems = Mask->getNumOperands();
2725  if (NumElems != 2 && NumElems != 4)
2726    return false;
2727  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
2728    if (!isUndefOrEqual(Mask->getOperand(i), i))
2729      return false;
2730  for (unsigned i = NumElems/2; i != NumElems; ++i)
2731    if (!isUndefOrEqual(Mask->getOperand(i), i+NumElems))
2732      return false;
2733  return true;
2734}
2735
2736/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
2737/// all the same.
2738static bool isSplatVector(SDNode *N) {
2739  if (N->getOpcode() != ISD::BUILD_VECTOR)
2740    return false;
2741
2742  SDValue SplatValue = N->getOperand(0);
2743  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
2744    if (N->getOperand(i) != SplatValue)
2745      return false;
2746  return true;
2747}
2748
2749/// isUndefShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
2750/// to an undef.
2751static bool isUndefShuffle(SDNode *N) {
2752  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
2753    return false;
2754
2755  SDValue V1 = N->getOperand(0);
2756  SDValue V2 = N->getOperand(1);
2757  SDValue Mask = N->getOperand(2);
2758  unsigned NumElems = Mask.getNumOperands();
2759  for (unsigned i = 0; i != NumElems; ++i) {
2760    SDValue Arg = Mask.getOperand(i);
2761    if (Arg.getOpcode() != ISD::UNDEF) {
2762      unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2763      if (Val < NumElems && V1.getOpcode() != ISD::UNDEF)
2764        return false;
2765      else if (Val >= NumElems && V2.getOpcode() != ISD::UNDEF)
2766        return false;
2767    }
2768  }
2769  return true;
2770}
2771
2772/// isZeroNode - Returns true if Elt is a constant zero or a floating point
2773/// constant +0.0.
2774static inline bool isZeroNode(SDValue Elt) {
2775  return ((isa<ConstantSDNode>(Elt) &&
2776           cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
2777          (isa<ConstantFPSDNode>(Elt) &&
2778           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
2779}
2780
2781/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
2782/// to an zero vector.
2783static bool isZeroShuffle(SDNode *N) {
2784  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
2785    return false;
2786
2787  SDValue V1 = N->getOperand(0);
2788  SDValue V2 = N->getOperand(1);
2789  SDValue Mask = N->getOperand(2);
2790  unsigned NumElems = Mask.getNumOperands();
2791  for (unsigned i = 0; i != NumElems; ++i) {
2792    SDValue Arg = Mask.getOperand(i);
2793    if (Arg.getOpcode() == ISD::UNDEF)
2794      continue;
2795
2796    unsigned Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
2797    if (Idx < NumElems) {
2798      unsigned Opc = V1.getNode()->getOpcode();
2799      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
2800        continue;
2801      if (Opc != ISD::BUILD_VECTOR ||
2802          !isZeroNode(V1.getNode()->getOperand(Idx)))
2803        return false;
2804    } else if (Idx >= NumElems) {
2805      unsigned Opc = V2.getNode()->getOpcode();
2806      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
2807        continue;
2808      if (Opc != ISD::BUILD_VECTOR ||
2809          !isZeroNode(V2.getNode()->getOperand(Idx - NumElems)))
2810        return false;
2811    }
2812  }
2813  return true;
2814}
2815
2816/// getZeroVector - Returns a vector of specified type with all zero elements.
2817///
2818static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG) {
2819  assert(VT.isVector() && "Expected a vector type");
2820
2821  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
2822  // type.  This ensures they get CSE'd.
2823  SDValue Vec;
2824  if (VT.getSizeInBits() == 64) { // MMX
2825    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
2826    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
2827  } else if (HasSSE2) {  // SSE2
2828    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
2829    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst);
2830  } else { // SSE1
2831    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
2832    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4f32, Cst, Cst, Cst, Cst);
2833  }
2834  return DAG.getNode(ISD::BIT_CONVERT, VT, Vec);
2835}
2836
2837/// getOnesVector - Returns a vector of specified type with all bits set.
2838///
2839static SDValue getOnesVector(MVT VT, SelectionDAG &DAG) {
2840  assert(VT.isVector() && "Expected a vector type");
2841
2842  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
2843  // type.  This ensures they get CSE'd.
2844  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
2845  SDValue Vec;
2846  if (VT.getSizeInBits() == 64)  // MMX
2847    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
2848  else                                              // SSE
2849    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst);
2850  return DAG.getNode(ISD::BIT_CONVERT, VT, Vec);
2851}
2852
2853
2854/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
2855/// that point to V2 points to its first element.
2856static SDValue NormalizeMask(SDValue Mask, SelectionDAG &DAG) {
2857  assert(Mask.getOpcode() == ISD::BUILD_VECTOR);
2858
2859  bool Changed = false;
2860  SmallVector<SDValue, 8> MaskVec;
2861  unsigned NumElems = Mask.getNumOperands();
2862  for (unsigned i = 0; i != NumElems; ++i) {
2863    SDValue Arg = Mask.getOperand(i);
2864    if (Arg.getOpcode() != ISD::UNDEF) {
2865      unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2866      if (Val > NumElems) {
2867        Arg = DAG.getConstant(NumElems, Arg.getValueType());
2868        Changed = true;
2869      }
2870    }
2871    MaskVec.push_back(Arg);
2872  }
2873
2874  if (Changed)
2875    Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(),
2876                       &MaskVec[0], MaskVec.size());
2877  return Mask;
2878}
2879
2880/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
2881/// operation of specified width.
2882static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG) {
2883  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
2884  MVT BaseVT = MaskVT.getVectorElementType();
2885
2886  SmallVector<SDValue, 8> MaskVec;
2887  MaskVec.push_back(DAG.getConstant(NumElems, BaseVT));
2888  for (unsigned i = 1; i != NumElems; ++i)
2889    MaskVec.push_back(DAG.getConstant(i, BaseVT));
2890  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
2891}
2892
2893/// getUnpacklMask - Returns a vector_shuffle mask for an unpackl operation
2894/// of specified width.
2895static SDValue getUnpacklMask(unsigned NumElems, SelectionDAG &DAG) {
2896  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
2897  MVT BaseVT = MaskVT.getVectorElementType();
2898  SmallVector<SDValue, 8> MaskVec;
2899  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
2900    MaskVec.push_back(DAG.getConstant(i,            BaseVT));
2901    MaskVec.push_back(DAG.getConstant(i + NumElems, BaseVT));
2902  }
2903  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
2904}
2905
2906/// getUnpackhMask - Returns a vector_shuffle mask for an unpackh operation
2907/// of specified width.
2908static SDValue getUnpackhMask(unsigned NumElems, SelectionDAG &DAG) {
2909  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
2910  MVT BaseVT = MaskVT.getVectorElementType();
2911  unsigned Half = NumElems/2;
2912  SmallVector<SDValue, 8> MaskVec;
2913  for (unsigned i = 0; i != Half; ++i) {
2914    MaskVec.push_back(DAG.getConstant(i + Half,            BaseVT));
2915    MaskVec.push_back(DAG.getConstant(i + NumElems + Half, BaseVT));
2916  }
2917  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
2918}
2919
2920/// getSwapEltZeroMask - Returns a vector_shuffle mask for a shuffle that swaps
2921/// element #0 of a vector with the specified index, leaving the rest of the
2922/// elements in place.
2923static SDValue getSwapEltZeroMask(unsigned NumElems, unsigned DestElt,
2924                                   SelectionDAG &DAG) {
2925  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
2926  MVT BaseVT = MaskVT.getVectorElementType();
2927  SmallVector<SDValue, 8> MaskVec;
2928  // Element #0 of the result gets the elt we are replacing.
2929  MaskVec.push_back(DAG.getConstant(DestElt, BaseVT));
2930  for (unsigned i = 1; i != NumElems; ++i)
2931    MaskVec.push_back(DAG.getConstant(i == DestElt ? 0 : i, BaseVT));
2932  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
2933}
2934
2935/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
2936static SDValue PromoteSplat(SDValue Op, SelectionDAG &DAG, bool HasSSE2) {
2937  MVT PVT = HasSSE2 ? MVT::v4i32 : MVT::v4f32;
2938  MVT VT = Op.getValueType();
2939  if (PVT == VT)
2940    return Op;
2941  SDValue V1 = Op.getOperand(0);
2942  SDValue Mask = Op.getOperand(2);
2943  unsigned NumElems = Mask.getNumOperands();
2944  // Special handling of v4f32 -> v4i32.
2945  if (VT != MVT::v4f32) {
2946    Mask = getUnpacklMask(NumElems, DAG);
2947    while (NumElems > 4) {
2948      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask);
2949      NumElems >>= 1;
2950    }
2951    Mask = getZeroVector(MVT::v4i32, true, DAG);
2952  }
2953
2954  V1 = DAG.getNode(ISD::BIT_CONVERT, PVT, V1);
2955  SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, PVT, V1,
2956                                  DAG.getNode(ISD::UNDEF, PVT), Mask);
2957  return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);
2958}
2959
2960/// isVectorLoad - Returns true if the node is a vector load, a scalar
2961/// load that's promoted to vector, or a load bitcasted.
2962static bool isVectorLoad(SDValue Op) {
2963  assert(Op.getValueType().isVector() && "Expected a vector type");
2964  if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR ||
2965      Op.getOpcode() == ISD::BIT_CONVERT) {
2966    return isa<LoadSDNode>(Op.getOperand(0));
2967  }
2968  return isa<LoadSDNode>(Op);
2969}
2970
2971
2972/// CanonicalizeMovddup - Cannonicalize movddup shuffle to v2f64.
2973///
2974static SDValue CanonicalizeMovddup(SDValue Op, SDValue V1, SDValue Mask,
2975                                   SelectionDAG &DAG, bool HasSSE3) {
2976  // If we have sse3 and shuffle has more than one use or input is a load, then
2977  // use movddup. Otherwise, use movlhps.
2978  bool UseMovddup = HasSSE3 && (!Op.hasOneUse() || isVectorLoad(V1));
2979  MVT PVT = UseMovddup ? MVT::v2f64 : MVT::v4f32;
2980  MVT VT = Op.getValueType();
2981  if (VT == PVT)
2982    return Op;
2983  unsigned NumElems = PVT.getVectorNumElements();
2984  if (NumElems == 2) {
2985    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
2986    Mask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
2987  } else {
2988    assert(NumElems == 4);
2989    SDValue Cst0 = DAG.getTargetConstant(0, MVT::i32);
2990    SDValue Cst1 = DAG.getTargetConstant(1, MVT::i32);
2991    Mask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst0, Cst1, Cst0, Cst1);
2992  }
2993
2994  V1 = DAG.getNode(ISD::BIT_CONVERT, PVT, V1);
2995  SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, PVT, V1,
2996                                DAG.getNode(ISD::UNDEF, PVT), Mask);
2997  return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);
2998}
2999
3000/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3001/// vector of zero or undef vector.  This produces a shuffle where the low
3002/// element of V2 is swizzled into the zero/undef vector, landing at element
3003/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
3004static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3005                                             bool isZero, bool HasSSE2,
3006                                             SelectionDAG &DAG) {
3007  MVT VT = V2.getValueType();
3008  SDValue V1 = isZero
3009    ? getZeroVector(VT, HasSSE2, DAG) : DAG.getNode(ISD::UNDEF, VT);
3010  unsigned NumElems = V2.getValueType().getVectorNumElements();
3011  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
3012  MVT EVT = MaskVT.getVectorElementType();
3013  SmallVector<SDValue, 16> MaskVec;
3014  for (unsigned i = 0; i != NumElems; ++i)
3015    if (i == Idx)  // If this is the insertion idx, put the low elt of V2 here.
3016      MaskVec.push_back(DAG.getConstant(NumElems, EVT));
3017    else
3018      MaskVec.push_back(DAG.getConstant(i, EVT));
3019  SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
3020                               &MaskVec[0], MaskVec.size());
3021  return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
3022}
3023
3024/// getNumOfConsecutiveZeros - Return the number of elements in a result of
3025/// a shuffle that is zero.
3026static
3027unsigned getNumOfConsecutiveZeros(SDValue Op, SDValue Mask,
3028                                  unsigned NumElems, bool Low,
3029                                  SelectionDAG &DAG) {
3030  unsigned NumZeros = 0;
3031  for (unsigned i = 0; i < NumElems; ++i) {
3032    unsigned Index = Low ? i : NumElems-i-1;
3033    SDValue Idx = Mask.getOperand(Index);
3034    if (Idx.getOpcode() == ISD::UNDEF) {
3035      ++NumZeros;
3036      continue;
3037    }
3038    SDValue Elt = DAG.getShuffleScalarElt(Op.getNode(), Index);
3039    if (Elt.getNode() && isZeroNode(Elt))
3040      ++NumZeros;
3041    else
3042      break;
3043  }
3044  return NumZeros;
3045}
3046
3047/// isVectorShift - Returns true if the shuffle can be implemented as a
3048/// logical left or right shift of a vector.
3049static bool isVectorShift(SDValue Op, SDValue Mask, SelectionDAG &DAG,
3050                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3051  unsigned NumElems = Mask.getNumOperands();
3052
3053  isLeft = true;
3054  unsigned NumZeros= getNumOfConsecutiveZeros(Op, Mask, NumElems, true, DAG);
3055  if (!NumZeros) {
3056    isLeft = false;
3057    NumZeros = getNumOfConsecutiveZeros(Op, Mask, NumElems, false, DAG);
3058    if (!NumZeros)
3059      return false;
3060  }
3061
3062  bool SeenV1 = false;
3063  bool SeenV2 = false;
3064  for (unsigned i = NumZeros; i < NumElems; ++i) {
3065    unsigned Val = isLeft ? (i - NumZeros) : i;
3066    SDValue Idx = Mask.getOperand(isLeft ? i : (i - NumZeros));
3067    if (Idx.getOpcode() == ISD::UNDEF)
3068      continue;
3069    unsigned Index = cast<ConstantSDNode>(Idx)->getZExtValue();
3070    if (Index < NumElems)
3071      SeenV1 = true;
3072    else {
3073      Index -= NumElems;
3074      SeenV2 = true;
3075    }
3076    if (Index != Val)
3077      return false;
3078  }
3079  if (SeenV1 && SeenV2)
3080    return false;
3081
3082  ShVal = SeenV1 ? Op.getOperand(0) : Op.getOperand(1);
3083  ShAmt = NumZeros;
3084  return true;
3085}
3086
3087
3088/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
3089///
3090static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
3091                                       unsigned NumNonZero, unsigned NumZero,
3092                                       SelectionDAG &DAG, TargetLowering &TLI) {
3093  if (NumNonZero > 8)
3094    return SDValue();
3095
3096  SDValue V(0, 0);
3097  bool First = true;
3098  for (unsigned i = 0; i < 16; ++i) {
3099    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
3100    if (ThisIsNonZero && First) {
3101      if (NumZero)
3102        V = getZeroVector(MVT::v8i16, true, DAG);
3103      else
3104        V = DAG.getNode(ISD::UNDEF, MVT::v8i16);
3105      First = false;
3106    }
3107
3108    if ((i & 1) != 0) {
3109      SDValue ThisElt(0, 0), LastElt(0, 0);
3110      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
3111      if (LastIsNonZero) {
3112        LastElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i-1));
3113      }
3114      if (ThisIsNonZero) {
3115        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i));
3116        ThisElt = DAG.getNode(ISD::SHL, MVT::i16,
3117                              ThisElt, DAG.getConstant(8, MVT::i8));
3118        if (LastIsNonZero)
3119          ThisElt = DAG.getNode(ISD::OR, MVT::i16, ThisElt, LastElt);
3120      } else
3121        ThisElt = LastElt;
3122
3123      if (ThisElt.getNode())
3124        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, ThisElt,
3125                        DAG.getIntPtrConstant(i/2));
3126    }
3127  }
3128
3129  return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, V);
3130}
3131
3132/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
3133///
3134static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
3135                                       unsigned NumNonZero, unsigned NumZero,
3136                                       SelectionDAG &DAG, TargetLowering &TLI) {
3137  if (NumNonZero > 4)
3138    return SDValue();
3139
3140  SDValue V(0, 0);
3141  bool First = true;
3142  for (unsigned i = 0; i < 8; ++i) {
3143    bool isNonZero = (NonZeros & (1 << i)) != 0;
3144    if (isNonZero) {
3145      if (First) {
3146        if (NumZero)
3147          V = getZeroVector(MVT::v8i16, true, DAG);
3148        else
3149          V = DAG.getNode(ISD::UNDEF, MVT::v8i16);
3150        First = false;
3151      }
3152      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, Op.getOperand(i),
3153                      DAG.getIntPtrConstant(i));
3154    }
3155  }
3156
3157  return V;
3158}
3159
3160/// getVShift - Return a vector logical shift node.
3161///
3162static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp,
3163                           unsigned NumBits, SelectionDAG &DAG,
3164                           const TargetLowering &TLI) {
3165  bool isMMX = VT.getSizeInBits() == 64;
3166  MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
3167  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
3168  SrcOp = DAG.getNode(ISD::BIT_CONVERT, ShVT, SrcOp);
3169  return DAG.getNode(ISD::BIT_CONVERT, VT,
3170                     DAG.getNode(Opc, ShVT, SrcOp,
3171                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
3172}
3173
3174SDValue
3175X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
3176  // All zero's are handled with pxor, all one's are handled with pcmpeqd.
3177  if (ISD::isBuildVectorAllZeros(Op.getNode())
3178      || ISD::isBuildVectorAllOnes(Op.getNode())) {
3179    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
3180    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
3181    // eliminated on x86-32 hosts.
3182    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
3183      return Op;
3184
3185    if (ISD::isBuildVectorAllOnes(Op.getNode()))
3186      return getOnesVector(Op.getValueType(), DAG);
3187    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG);
3188  }
3189
3190  MVT VT = Op.getValueType();
3191  MVT EVT = VT.getVectorElementType();
3192  unsigned EVTBits = EVT.getSizeInBits();
3193
3194  unsigned NumElems = Op.getNumOperands();
3195  unsigned NumZero  = 0;
3196  unsigned NumNonZero = 0;
3197  unsigned NonZeros = 0;
3198  bool IsAllConstants = true;
3199  SmallSet<SDValue, 8> Values;
3200  for (unsigned i = 0; i < NumElems; ++i) {
3201    SDValue Elt = Op.getOperand(i);
3202    if (Elt.getOpcode() == ISD::UNDEF)
3203      continue;
3204    Values.insert(Elt);
3205    if (Elt.getOpcode() != ISD::Constant &&
3206        Elt.getOpcode() != ISD::ConstantFP)
3207      IsAllConstants = false;
3208    if (isZeroNode(Elt))
3209      NumZero++;
3210    else {
3211      NonZeros |= (1 << i);
3212      NumNonZero++;
3213    }
3214  }
3215
3216  if (NumNonZero == 0) {
3217    // All undef vector. Return an UNDEF.  All zero vectors were handled above.
3218    return DAG.getNode(ISD::UNDEF, VT);
3219  }
3220
3221  // Special case for single non-zero, non-undef, element.
3222  if (NumNonZero == 1 && NumElems <= 4) {
3223    unsigned Idx = CountTrailingZeros_32(NonZeros);
3224    SDValue Item = Op.getOperand(Idx);
3225
3226    // If this is an insertion of an i64 value on x86-32, and if the top bits of
3227    // the value are obviously zero, truncate the value to i32 and do the
3228    // insertion that way.  Only do this if the value is non-constant or if the
3229    // value is a constant being inserted into element 0.  It is cheaper to do
3230    // a constant pool load than it is to do a movd + shuffle.
3231    if (EVT == MVT::i64 && !Subtarget->is64Bit() &&
3232        (!IsAllConstants || Idx == 0)) {
3233      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
3234        // Handle MMX and SSE both.
3235        MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
3236        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
3237
3238        // Truncate the value (which may itself be a constant) to i32, and
3239        // convert it to a vector with movd (S2V+shuffle to zero extend).
3240        Item = DAG.getNode(ISD::TRUNCATE, MVT::i32, Item);
3241        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VecVT, Item);
3242        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3243                                           Subtarget->hasSSE2(), DAG);
3244
3245        // Now we have our 32-bit value zero extended in the low element of
3246        // a vector.  If Idx != 0, swizzle it into place.
3247        if (Idx != 0) {
3248          SDValue Ops[] = {
3249            Item, DAG.getNode(ISD::UNDEF, Item.getValueType()),
3250            getSwapEltZeroMask(VecElts, Idx, DAG)
3251          };
3252          Item = DAG.getNode(ISD::VECTOR_SHUFFLE, VecVT, Ops, 3);
3253        }
3254        return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Item);
3255      }
3256    }
3257
3258    // If we have a constant or non-constant insertion into the low element of
3259    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
3260    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
3261    // depending on what the source datatype is.  Because we can only get here
3262    // when NumElems <= 4, this only needs to handle i32/f32/i64/f64.
3263    if (Idx == 0 &&
3264        // Don't do this for i64 values on x86-32.
3265        (EVT != MVT::i64 || Subtarget->is64Bit())) {
3266      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item);
3267      // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
3268      return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
3269                                         Subtarget->hasSSE2(), DAG);
3270    }
3271
3272    // Is it a vector logical left shift?
3273    if (NumElems == 2 && Idx == 1 &&
3274        isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
3275      unsigned NumBits = VT.getSizeInBits();
3276      return getVShift(true, VT,
3277                       DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(1)),
3278                       NumBits/2, DAG, *this);
3279    }
3280
3281    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
3282      return SDValue();
3283
3284    // Otherwise, if this is a vector with i32 or f32 elements, and the element
3285    // is a non-constant being inserted into an element other than the low one,
3286    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
3287    // movd/movss) to move this into the low element, then shuffle it into
3288    // place.
3289    if (EVTBits == 32) {
3290      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item);
3291
3292      // Turn it into a shuffle of zero and zero-extended scalar to vector.
3293      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
3294                                         Subtarget->hasSSE2(), DAG);
3295      MVT MaskVT  = MVT::getIntVectorWithNumElements(NumElems);
3296      MVT MaskEVT = MaskVT.getVectorElementType();
3297      SmallVector<SDValue, 8> MaskVec;
3298      for (unsigned i = 0; i < NumElems; i++)
3299        MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
3300      SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
3301                                   &MaskVec[0], MaskVec.size());
3302      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Item,
3303                         DAG.getNode(ISD::UNDEF, VT), Mask);
3304    }
3305  }
3306
3307  // Splat is obviously ok. Let legalizer expand it to a shuffle.
3308  if (Values.size() == 1)
3309    return SDValue();
3310
3311  // A vector full of immediates; various special cases are already
3312  // handled, so this is best done with a single constant-pool load.
3313  if (IsAllConstants)
3314    return SDValue();
3315
3316  // Let legalizer expand 2-wide build_vectors.
3317  if (EVTBits == 64) {
3318    if (NumNonZero == 1) {
3319      // One half is zero or undef.
3320      unsigned Idx = CountTrailingZeros_32(NonZeros);
3321      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT,
3322                                 Op.getOperand(Idx));
3323      return getShuffleVectorZeroOrUndef(V2, Idx, true,
3324                                         Subtarget->hasSSE2(), DAG);
3325    }
3326    return SDValue();
3327  }
3328
3329  // If element VT is < 32 bits, convert it to inserts into a zero vector.
3330  if (EVTBits == 8 && NumElems == 16) {
3331    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
3332                                        *this);
3333    if (V.getNode()) return V;
3334  }
3335
3336  if (EVTBits == 16 && NumElems == 8) {
3337    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
3338                                        *this);
3339    if (V.getNode()) return V;
3340  }
3341
3342  // If element VT is == 32 bits, turn it into a number of shuffles.
3343  SmallVector<SDValue, 8> V;
3344  V.resize(NumElems);
3345  if (NumElems == 4 && NumZero > 0) {
3346    for (unsigned i = 0; i < 4; ++i) {
3347      bool isZero = !(NonZeros & (1 << i));
3348      if (isZero)
3349        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG);
3350      else
3351        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
3352    }
3353
3354    for (unsigned i = 0; i < 2; ++i) {
3355      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
3356        default: break;
3357        case 0:
3358          V[i] = V[i*2];  // Must be a zero vector.
3359          break;
3360        case 1:
3361          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2+1], V[i*2],
3362                             getMOVLMask(NumElems, DAG));
3363          break;
3364        case 2:
3365          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
3366                             getMOVLMask(NumElems, DAG));
3367          break;
3368        case 3:
3369          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
3370                             getUnpacklMask(NumElems, DAG));
3371          break;
3372      }
3373    }
3374
3375    MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
3376    MVT EVT = MaskVT.getVectorElementType();
3377    SmallVector<SDValue, 8> MaskVec;
3378    bool Reverse = (NonZeros & 0x3) == 2;
3379    for (unsigned i = 0; i < 2; ++i)
3380      if (Reverse)
3381        MaskVec.push_back(DAG.getConstant(1-i, EVT));
3382      else
3383        MaskVec.push_back(DAG.getConstant(i, EVT));
3384    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
3385    for (unsigned i = 0; i < 2; ++i)
3386      if (Reverse)
3387        MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT));
3388      else
3389        MaskVec.push_back(DAG.getConstant(i+NumElems, EVT));
3390    SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
3391                                     &MaskVec[0], MaskVec.size());
3392    return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[0], V[1], ShufMask);
3393  }
3394
3395  if (Values.size() > 2) {
3396    // Expand into a number of unpckl*.
3397    // e.g. for v4f32
3398    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
3399    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
3400    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
3401    SDValue UnpckMask = getUnpacklMask(NumElems, DAG);
3402    for (unsigned i = 0; i < NumElems; ++i)
3403      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
3404    NumElems >>= 1;
3405    while (NumElems != 0) {
3406      for (unsigned i = 0; i < NumElems; ++i)
3407        V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i], V[i + NumElems],
3408                           UnpckMask);
3409      NumElems >>= 1;
3410    }
3411    return V[0];
3412  }
3413
3414  return SDValue();
3415}
3416
3417static
3418SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
3419                                 SDValue PermMask, SelectionDAG &DAG,
3420                                 TargetLowering &TLI) {
3421  SDValue NewV;
3422  MVT MaskVT = MVT::getIntVectorWithNumElements(8);
3423  MVT MaskEVT = MaskVT.getVectorElementType();
3424  MVT PtrVT = TLI.getPointerTy();
3425  SmallVector<SDValue, 8> MaskElts(PermMask.getNode()->op_begin(),
3426                                   PermMask.getNode()->op_end());
3427
3428  // First record which half of which vector the low elements come from.
3429  SmallVector<unsigned, 4> LowQuad(4);
3430  for (unsigned i = 0; i < 4; ++i) {
3431    SDValue Elt = MaskElts[i];
3432    if (Elt.getOpcode() == ISD::UNDEF)
3433      continue;
3434    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
3435    int QuadIdx = EltIdx / 4;
3436    ++LowQuad[QuadIdx];
3437  }
3438
3439  int BestLowQuad = -1;
3440  unsigned MaxQuad = 1;
3441  for (unsigned i = 0; i < 4; ++i) {
3442    if (LowQuad[i] > MaxQuad) {
3443      BestLowQuad = i;
3444      MaxQuad = LowQuad[i];
3445    }
3446  }
3447
3448  // Record which half of which vector the high elements come from.
3449  SmallVector<unsigned, 4> HighQuad(4);
3450  for (unsigned i = 4; i < 8; ++i) {
3451    SDValue Elt = MaskElts[i];
3452    if (Elt.getOpcode() == ISD::UNDEF)
3453      continue;
3454    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
3455    int QuadIdx = EltIdx / 4;
3456    ++HighQuad[QuadIdx];
3457  }
3458
3459  int BestHighQuad = -1;
3460  MaxQuad = 1;
3461  for (unsigned i = 0; i < 4; ++i) {
3462    if (HighQuad[i] > MaxQuad) {
3463      BestHighQuad = i;
3464      MaxQuad = HighQuad[i];
3465    }
3466  }
3467
3468  // If it's possible to sort parts of either half with PSHUF{H|L}W, then do it.
3469  if (BestLowQuad != -1 || BestHighQuad != -1) {
3470    // First sort the 4 chunks in order using shufpd.
3471    SmallVector<SDValue, 8> MaskVec;
3472
3473    if (BestLowQuad != -1)
3474      MaskVec.push_back(DAG.getConstant(BestLowQuad, MVT::i32));
3475    else
3476      MaskVec.push_back(DAG.getConstant(0, MVT::i32));
3477
3478    if (BestHighQuad != -1)
3479      MaskVec.push_back(DAG.getConstant(BestHighQuad, MVT::i32));
3480    else
3481      MaskVec.push_back(DAG.getConstant(1, MVT::i32));
3482
3483    SDValue Mask= DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, &MaskVec[0],2);
3484    NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v2i64,
3485                       DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, V1),
3486                       DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, V2), Mask);
3487    NewV = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, NewV);
3488
3489    // Now sort high and low parts separately.
3490    BitVector InOrder(8);
3491    if (BestLowQuad != -1) {
3492      // Sort lower half in order using PSHUFLW.
3493      MaskVec.clear();
3494      bool AnyOutOrder = false;
3495
3496      for (unsigned i = 0; i != 4; ++i) {
3497        SDValue Elt = MaskElts[i];
3498        if (Elt.getOpcode() == ISD::UNDEF) {
3499          MaskVec.push_back(Elt);
3500          InOrder.set(i);
3501        } else {
3502          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
3503          if (EltIdx != i)
3504            AnyOutOrder = true;
3505
3506          MaskVec.push_back(DAG.getConstant(EltIdx % 4, MaskEVT));
3507
3508          // If this element is in the right place after this shuffle, then
3509          // remember it.
3510          if ((int)(EltIdx / 4) == BestLowQuad)
3511            InOrder.set(i);
3512        }
3513      }
3514      if (AnyOutOrder) {
3515        for (unsigned i = 4; i != 8; ++i)
3516          MaskVec.push_back(DAG.getConstant(i, MaskEVT));
3517        SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
3518        NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, NewV, NewV, Mask);
3519      }
3520    }
3521
3522    if (BestHighQuad != -1) {
3523      // Sort high half in order using PSHUFHW if possible.
3524      MaskVec.clear();
3525
3526      for (unsigned i = 0; i != 4; ++i)
3527        MaskVec.push_back(DAG.getConstant(i, MaskEVT));
3528
3529      bool AnyOutOrder = false;
3530      for (unsigned i = 4; i != 8; ++i) {
3531        SDValue Elt = MaskElts[i];
3532        if (Elt.getOpcode() == ISD::UNDEF) {
3533          MaskVec.push_back(Elt);
3534          InOrder.set(i);
3535        } else {
3536          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
3537          if (EltIdx != i)
3538            AnyOutOrder = true;
3539
3540          MaskVec.push_back(DAG.getConstant((EltIdx % 4) + 4, MaskEVT));
3541
3542          // If this element is in the right place after this shuffle, then
3543          // remember it.
3544          if ((int)(EltIdx / 4) == BestHighQuad)
3545            InOrder.set(i);
3546        }
3547      }
3548
3549      if (AnyOutOrder) {
3550        SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
3551        NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, NewV, NewV, Mask);
3552      }
3553    }
3554
3555    // The other elements are put in the right place using pextrw and pinsrw.
3556    for (unsigned i = 0; i != 8; ++i) {
3557      if (InOrder[i])
3558        continue;
3559      SDValue Elt = MaskElts[i];
3560      if (Elt.getOpcode() == ISD::UNDEF)
3561        continue;
3562      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
3563      SDValue ExtOp = (EltIdx < 8)
3564        ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1,
3565                      DAG.getConstant(EltIdx, PtrVT))
3566        : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2,
3567                      DAG.getConstant(EltIdx - 8, PtrVT));
3568      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
3569                         DAG.getConstant(i, PtrVT));
3570    }
3571
3572    return NewV;
3573  }
3574
3575  // PSHUF{H|L}W are not used. Lower into extracts and inserts but try to use as
3576  // few as possible. First, let's find out how many elements are already in the
3577  // right order.
3578  unsigned V1InOrder = 0;
3579  unsigned V1FromV1 = 0;
3580  unsigned V2InOrder = 0;
3581  unsigned V2FromV2 = 0;
3582  SmallVector<SDValue, 8> V1Elts;
3583  SmallVector<SDValue, 8> V2Elts;
3584  for (unsigned i = 0; i < 8; ++i) {
3585    SDValue Elt = MaskElts[i];
3586    if (Elt.getOpcode() == ISD::UNDEF) {
3587      V1Elts.push_back(Elt);
3588      V2Elts.push_back(Elt);
3589      ++V1InOrder;
3590      ++V2InOrder;
3591      continue;
3592    }
3593    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
3594    if (EltIdx == i) {
3595      V1Elts.push_back(Elt);
3596      V2Elts.push_back(DAG.getConstant(i+8, MaskEVT));
3597      ++V1InOrder;
3598    } else if (EltIdx == i+8) {
3599      V1Elts.push_back(Elt);
3600      V2Elts.push_back(DAG.getConstant(i, MaskEVT));
3601      ++V2InOrder;
3602    } else if (EltIdx < 8) {
3603      V1Elts.push_back(Elt);
3604      ++V1FromV1;
3605    } else {
3606      V2Elts.push_back(DAG.getConstant(EltIdx-8, MaskEVT));
3607      ++V2FromV2;
3608    }
3609  }
3610
3611  if (V2InOrder > V1InOrder) {
3612    PermMask = CommuteVectorShuffleMask(PermMask, DAG);
3613    std::swap(V1, V2);
3614    std::swap(V1Elts, V2Elts);
3615    std::swap(V1FromV1, V2FromV2);
3616  }
3617
3618  if ((V1FromV1 + V1InOrder) != 8) {
3619    // Some elements are from V2.
3620    if (V1FromV1) {
3621      // If there are elements that are from V1 but out of place,
3622      // then first sort them in place
3623      SmallVector<SDValue, 8> MaskVec;
3624      for (unsigned i = 0; i < 8; ++i) {
3625        SDValue Elt = V1Elts[i];
3626        if (Elt.getOpcode() == ISD::UNDEF) {
3627          MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
3628          continue;
3629        }
3630        unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
3631        if (EltIdx >= 8)
3632          MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
3633        else
3634          MaskVec.push_back(DAG.getConstant(EltIdx, MaskEVT));
3635      }
3636      SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
3637      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, V1, V1, Mask);
3638    }
3639
3640    NewV = V1;
3641    for (unsigned i = 0; i < 8; ++i) {
3642      SDValue Elt = V1Elts[i];
3643      if (Elt.getOpcode() == ISD::UNDEF)
3644        continue;
3645      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
3646      if (EltIdx < 8)
3647        continue;
3648      SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2,
3649                                    DAG.getConstant(EltIdx - 8, PtrVT));
3650      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
3651                         DAG.getConstant(i, PtrVT));
3652    }
3653    return NewV;
3654  } else {
3655    // All elements are from V1.
3656    NewV = V1;
3657    for (unsigned i = 0; i < 8; ++i) {
3658      SDValue Elt = V1Elts[i];
3659      if (Elt.getOpcode() == ISD::UNDEF)
3660        continue;
3661      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
3662      SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1,
3663                                    DAG.getConstant(EltIdx, PtrVT));
3664      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
3665                         DAG.getConstant(i, PtrVT));
3666    }
3667    return NewV;
3668  }
3669}
3670
3671/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
3672/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be
3673/// done when every pair / quad of shuffle mask elements point to elements in
3674/// the right sequence. e.g.
3675/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
3676static
3677SDValue RewriteAsNarrowerShuffle(SDValue V1, SDValue V2,
3678                                MVT VT,
3679                                SDValue PermMask, SelectionDAG &DAG,
3680                                TargetLowering &TLI) {
3681  unsigned NumElems = PermMask.getNumOperands();
3682  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
3683  MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
3684  MVT MaskEltVT = MaskVT.getVectorElementType();
3685  MVT NewVT = MaskVT;
3686  switch (VT.getSimpleVT()) {
3687  default: assert(false && "Unexpected!");
3688  case MVT::v4f32: NewVT = MVT::v2f64; break;
3689  case MVT::v4i32: NewVT = MVT::v2i64; break;
3690  case MVT::v8i16: NewVT = MVT::v4i32; break;
3691  case MVT::v16i8: NewVT = MVT::v4i32; break;
3692  }
3693
3694  if (NewWidth == 2) {
3695    if (VT.isInteger())
3696      NewVT = MVT::v2i64;
3697    else
3698      NewVT = MVT::v2f64;
3699  }
3700  unsigned Scale = NumElems / NewWidth;
3701  SmallVector<SDValue, 8> MaskVec;
3702  for (unsigned i = 0; i < NumElems; i += Scale) {
3703    unsigned StartIdx = ~0U;
3704    for (unsigned j = 0; j < Scale; ++j) {
3705      SDValue Elt = PermMask.getOperand(i+j);
3706      if (Elt.getOpcode() == ISD::UNDEF)
3707        continue;
3708      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
3709      if (StartIdx == ~0U)
3710        StartIdx = EltIdx - (EltIdx % Scale);
3711      if (EltIdx != StartIdx + j)
3712        return SDValue();
3713    }
3714    if (StartIdx == ~0U)
3715      MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEltVT));
3716    else
3717      MaskVec.push_back(DAG.getConstant(StartIdx / Scale, MaskEltVT));
3718  }
3719
3720  V1 = DAG.getNode(ISD::BIT_CONVERT, NewVT, V1);
3721  V2 = DAG.getNode(ISD::BIT_CONVERT, NewVT, V2);
3722  return DAG.getNode(ISD::VECTOR_SHUFFLE, NewVT, V1, V2,
3723                     DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
3724                                 &MaskVec[0], MaskVec.size()));
3725}
3726
3727/// getVZextMovL - Return a zero-extending vector move low node.
3728///
3729static SDValue getVZextMovL(MVT VT, MVT OpVT,
3730                              SDValue SrcOp, SelectionDAG &DAG,
3731                              const X86Subtarget *Subtarget) {
3732  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
3733    LoadSDNode *LD = NULL;
3734    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
3735      LD = dyn_cast<LoadSDNode>(SrcOp);
3736    if (!LD) {
3737      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
3738      // instead.
3739      MVT EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
3740      if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
3741          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
3742          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
3743          SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
3744        // PR2108
3745        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
3746        return DAG.getNode(ISD::BIT_CONVERT, VT,
3747                           DAG.getNode(X86ISD::VZEXT_MOVL, OpVT,
3748                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, OpVT,
3749                                                   SrcOp.getOperand(0)
3750                                                          .getOperand(0))));
3751      }
3752    }
3753  }
3754
3755  return DAG.getNode(ISD::BIT_CONVERT, VT,
3756                     DAG.getNode(X86ISD::VZEXT_MOVL, OpVT,
3757                                 DAG.getNode(ISD::BIT_CONVERT, OpVT, SrcOp)));
3758}
3759
3760/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
3761/// shuffles.
3762static SDValue
3763LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2,
3764                          SDValue PermMask, MVT VT, SelectionDAG &DAG) {
3765  MVT MaskVT = PermMask.getValueType();
3766  MVT MaskEVT = MaskVT.getVectorElementType();
3767  SmallVector<std::pair<int, int>, 8> Locs;
3768  Locs.resize(4);
3769  SmallVector<SDValue, 8> Mask1(4, DAG.getNode(ISD::UNDEF, MaskEVT));
3770  unsigned NumHi = 0;
3771  unsigned NumLo = 0;
3772  for (unsigned i = 0; i != 4; ++i) {
3773    SDValue Elt = PermMask.getOperand(i);
3774    if (Elt.getOpcode() == ISD::UNDEF) {
3775      Locs[i] = std::make_pair(-1, -1);
3776    } else {
3777      unsigned Val = cast<ConstantSDNode>(Elt)->getZExtValue();
3778      assert(Val < 8 && "Invalid VECTOR_SHUFFLE index!");
3779      if (Val < 4) {
3780        Locs[i] = std::make_pair(0, NumLo);
3781        Mask1[NumLo] = Elt;
3782        NumLo++;
3783      } else {
3784        Locs[i] = std::make_pair(1, NumHi);
3785        if (2+NumHi < 4)
3786          Mask1[2+NumHi] = Elt;
3787        NumHi++;
3788      }
3789    }
3790  }
3791
3792  if (NumLo <= 2 && NumHi <= 2) {
3793    // If no more than two elements come from either vector. This can be
3794    // implemented with two shuffles. First shuffle gather the elements.
3795    // The second shuffle, which takes the first shuffle as both of its
3796    // vector operands, put the elements into the right order.
3797    V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
3798                     DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
3799                                 &Mask1[0], Mask1.size()));
3800
3801    SmallVector<SDValue, 8> Mask2(4, DAG.getNode(ISD::UNDEF, MaskEVT));
3802    for (unsigned i = 0; i != 4; ++i) {
3803      if (Locs[i].first == -1)
3804        continue;
3805      else {
3806        unsigned Idx = (i < 2) ? 0 : 4;
3807        Idx += Locs[i].first * 2 + Locs[i].second;
3808        Mask2[i] = DAG.getConstant(Idx, MaskEVT);
3809      }
3810    }
3811
3812    return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1,
3813                       DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
3814                                   &Mask2[0], Mask2.size()));
3815  } else if (NumLo == 3 || NumHi == 3) {
3816    // Otherwise, we must have three elements from one vector, call it X, and
3817    // one element from the other, call it Y.  First, use a shufps to build an
3818    // intermediate vector with the one element from Y and the element from X
3819    // that will be in the same half in the final destination (the indexes don't
3820    // matter). Then, use a shufps to build the final vector, taking the half
3821    // containing the element from Y from the intermediate, and the other half
3822    // from X.
3823    if (NumHi == 3) {
3824      // Normalize it so the 3 elements come from V1.
3825      PermMask = CommuteVectorShuffleMask(PermMask, DAG);
3826      std::swap(V1, V2);
3827    }
3828
3829    // Find the element from V2.
3830    unsigned HiIndex;
3831    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
3832      SDValue Elt = PermMask.getOperand(HiIndex);
3833      if (Elt.getOpcode() == ISD::UNDEF)
3834        continue;
3835      unsigned Val = cast<ConstantSDNode>(Elt)->getZExtValue();
3836      if (Val >= 4)
3837        break;
3838    }
3839
3840    Mask1[0] = PermMask.getOperand(HiIndex);
3841    Mask1[1] = DAG.getNode(ISD::UNDEF, MaskEVT);
3842    Mask1[2] = PermMask.getOperand(HiIndex^1);
3843    Mask1[3] = DAG.getNode(ISD::UNDEF, MaskEVT);
3844    V2 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
3845                     DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask1[0], 4));
3846
3847    if (HiIndex >= 2) {
3848      Mask1[0] = PermMask.getOperand(0);
3849      Mask1[1] = PermMask.getOperand(1);
3850      Mask1[2] = DAG.getConstant(HiIndex & 1 ? 6 : 4, MaskEVT);
3851      Mask1[3] = DAG.getConstant(HiIndex & 1 ? 4 : 6, MaskEVT);
3852      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
3853                         DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask1[0], 4));
3854    } else {
3855      Mask1[0] = DAG.getConstant(HiIndex & 1 ? 2 : 0, MaskEVT);
3856      Mask1[1] = DAG.getConstant(HiIndex & 1 ? 0 : 2, MaskEVT);
3857      Mask1[2] = PermMask.getOperand(2);
3858      Mask1[3] = PermMask.getOperand(3);
3859      if (Mask1[2].getOpcode() != ISD::UNDEF)
3860        Mask1[2] =
3861          DAG.getConstant(cast<ConstantSDNode>(Mask1[2])->getZExtValue()+4,
3862                          MaskEVT);
3863      if (Mask1[3].getOpcode() != ISD::UNDEF)
3864        Mask1[3] =
3865          DAG.getConstant(cast<ConstantSDNode>(Mask1[3])->getZExtValue()+4,
3866                          MaskEVT);
3867      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V2, V1,
3868                         DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask1[0], 4));
3869    }
3870  }
3871
3872  // Break it into (shuffle shuffle_hi, shuffle_lo).
3873  Locs.clear();
3874  SmallVector<SDValue,8> LoMask(4, DAG.getNode(ISD::UNDEF, MaskEVT));
3875  SmallVector<SDValue,8> HiMask(4, DAG.getNode(ISD::UNDEF, MaskEVT));
3876  SmallVector<SDValue,8> *MaskPtr = &LoMask;
3877  unsigned MaskIdx = 0;
3878  unsigned LoIdx = 0;
3879  unsigned HiIdx = 2;
3880  for (unsigned i = 0; i != 4; ++i) {
3881    if (i == 2) {
3882      MaskPtr = &HiMask;
3883      MaskIdx = 1;
3884      LoIdx = 0;
3885      HiIdx = 2;
3886    }
3887    SDValue Elt = PermMask.getOperand(i);
3888    if (Elt.getOpcode() == ISD::UNDEF) {
3889      Locs[i] = std::make_pair(-1, -1);
3890    } else if (cast<ConstantSDNode>(Elt)->getZExtValue() < 4) {
3891      Locs[i] = std::make_pair(MaskIdx, LoIdx);
3892      (*MaskPtr)[LoIdx] = Elt;
3893      LoIdx++;
3894    } else {
3895      Locs[i] = std::make_pair(MaskIdx, HiIdx);
3896      (*MaskPtr)[HiIdx] = Elt;
3897      HiIdx++;
3898    }
3899  }
3900
3901  SDValue LoShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
3902                                    DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
3903                                                &LoMask[0], LoMask.size()));
3904  SDValue HiShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
3905                                    DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
3906                                                &HiMask[0], HiMask.size()));
3907  SmallVector<SDValue, 8> MaskOps;
3908  for (unsigned i = 0; i != 4; ++i) {
3909    if (Locs[i].first == -1) {
3910      MaskOps.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
3911    } else {
3912      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
3913      MaskOps.push_back(DAG.getConstant(Idx, MaskEVT));
3914    }
3915  }
3916  return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, LoShuffle, HiShuffle,
3917                     DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
3918                                 &MaskOps[0], MaskOps.size()));
3919}
3920
3921SDValue
3922X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
3923  SDValue V1 = Op.getOperand(0);
3924  SDValue V2 = Op.getOperand(1);
3925  SDValue PermMask = Op.getOperand(2);
3926  MVT VT = Op.getValueType();
3927  unsigned NumElems = PermMask.getNumOperands();
3928  bool isMMX = VT.getSizeInBits() == 64;
3929  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
3930  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
3931  bool V1IsSplat = false;
3932  bool V2IsSplat = false;
3933
3934  if (isUndefShuffle(Op.getNode()))
3935    return DAG.getNode(ISD::UNDEF, VT);
3936
3937  if (isZeroShuffle(Op.getNode()))
3938    return getZeroVector(VT, Subtarget->hasSSE2(), DAG);
3939
3940  if (isIdentityMask(PermMask.getNode()))
3941    return V1;
3942  else if (isIdentityMask(PermMask.getNode(), true))
3943    return V2;
3944
3945  // Canonicalize movddup shuffles.
3946  if (V2IsUndef && Subtarget->hasSSE2() &&
3947      X86::isMOVDDUPMask(PermMask.getNode()))
3948    return CanonicalizeMovddup(Op, V1, PermMask, DAG, Subtarget->hasSSE3());
3949
3950  if (isSplatMask(PermMask.getNode())) {
3951    if (isMMX || NumElems < 4) return Op;
3952    // Promote it to a v4{if}32 splat.
3953    return PromoteSplat(Op, DAG, Subtarget->hasSSE2());
3954  }
3955
3956  // If the shuffle can be profitably rewritten as a narrower shuffle, then
3957  // do it!
3958  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
3959    SDValue NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
3960    if (NewOp.getNode())
3961      return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
3962  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
3963    // FIXME: Figure out a cleaner way to do this.
3964    // Try to make use of movq to zero out the top part.
3965    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
3966      SDValue NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
3967                                                 DAG, *this);
3968      if (NewOp.getNode()) {
3969        SDValue NewV1 = NewOp.getOperand(0);
3970        SDValue NewV2 = NewOp.getOperand(1);
3971        SDValue NewMask = NewOp.getOperand(2);
3972        if (isCommutedMOVL(NewMask.getNode(), true, false)) {
3973          NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG);
3974          return getVZextMovL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget);
3975        }
3976      }
3977    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
3978      SDValue NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
3979                                                DAG, *this);
3980      if (NewOp.getNode() && X86::isMOVLMask(NewOp.getOperand(2).getNode()))
3981        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
3982                             DAG, Subtarget);
3983    }
3984  }
3985
3986  // Check if this can be converted into a logical shift.
3987  bool isLeft = false;
3988  unsigned ShAmt = 0;
3989  SDValue ShVal;
3990  bool isShift = isVectorShift(Op, PermMask, DAG, isLeft, ShVal, ShAmt);
3991  if (isShift && ShVal.hasOneUse()) {
3992    // If the shifted value has multiple uses, it may be cheaper to use
3993    // v_set0 + movlhps or movhlps, etc.
3994    MVT EVT = VT.getVectorElementType();
3995    ShAmt *= EVT.getSizeInBits();
3996    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this);
3997  }
3998
3999  if (X86::isMOVLMask(PermMask.getNode())) {
4000    if (V1IsUndef)
4001      return V2;
4002    if (ISD::isBuildVectorAllZeros(V1.getNode()))
4003      return getVZextMovL(VT, VT, V2, DAG, Subtarget);
4004    if (!isMMX)
4005      return Op;
4006  }
4007
4008  if (!isMMX && (X86::isMOVSHDUPMask(PermMask.getNode()) ||
4009                 X86::isMOVSLDUPMask(PermMask.getNode()) ||
4010                 X86::isMOVHLPSMask(PermMask.getNode()) ||
4011                 X86::isMOVHPMask(PermMask.getNode()) ||
4012                 X86::isMOVLPMask(PermMask.getNode())))
4013    return Op;
4014
4015  if (ShouldXformToMOVHLPS(PermMask.getNode()) ||
4016      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), PermMask.getNode()))
4017    return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
4018
4019  if (isShift) {
4020    // No better options. Use a vshl / vsrl.
4021    MVT EVT = VT.getVectorElementType();
4022    ShAmt *= EVT.getSizeInBits();
4023    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this);
4024  }
4025
4026  bool Commuted = false;
4027  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
4028  // 1,1,1,1 -> v8i16 though.
4029  V1IsSplat = isSplatVector(V1.getNode());
4030  V2IsSplat = isSplatVector(V2.getNode());
4031
4032  // Canonicalize the splat or undef, if present, to be on the RHS.
4033  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
4034    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
4035    std::swap(V1IsSplat, V2IsSplat);
4036    std::swap(V1IsUndef, V2IsUndef);
4037    Commuted = true;
4038  }
4039
4040  // FIXME: Figure out a cleaner way to do this.
4041  if (isCommutedMOVL(PermMask.getNode(), V2IsSplat, V2IsUndef)) {
4042    if (V2IsUndef) return V1;
4043    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
4044    if (V2IsSplat) {
4045      // V2 is a splat, so the mask may be malformed. That is, it may point
4046      // to any V2 element. The instruction selectior won't like this. Get
4047      // a corrected mask and commute to form a proper MOVS{S|D}.
4048      SDValue NewMask = getMOVLMask(NumElems, DAG);
4049      if (NewMask.getNode() != PermMask.getNode())
4050        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
4051    }
4052    return Op;
4053  }
4054
4055  if (X86::isUNPCKL_v_undef_Mask(PermMask.getNode()) ||
4056      X86::isUNPCKH_v_undef_Mask(PermMask.getNode()) ||
4057      X86::isUNPCKLMask(PermMask.getNode()) ||
4058      X86::isUNPCKHMask(PermMask.getNode()))
4059    return Op;
4060
4061  if (V2IsSplat) {
4062    // Normalize mask so all entries that point to V2 points to its first
4063    // element then try to match unpck{h|l} again. If match, return a
4064    // new vector_shuffle with the corrected mask.
4065    SDValue NewMask = NormalizeMask(PermMask, DAG);
4066    if (NewMask.getNode() != PermMask.getNode()) {
4067      if (X86::isUNPCKLMask(PermMask.getNode(), true)) {
4068        SDValue NewMask = getUnpacklMask(NumElems, DAG);
4069        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
4070      } else if (X86::isUNPCKHMask(PermMask.getNode(), true)) {
4071        SDValue NewMask = getUnpackhMask(NumElems, DAG);
4072        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
4073      }
4074    }
4075  }
4076
4077  // Normalize the node to match x86 shuffle ops if needed
4078  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(PermMask.getNode()))
4079      Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
4080
4081  if (Commuted) {
4082    // Commute is back and try unpck* again.
4083    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
4084    if (X86::isUNPCKL_v_undef_Mask(PermMask.getNode()) ||
4085        X86::isUNPCKH_v_undef_Mask(PermMask.getNode()) ||
4086        X86::isUNPCKLMask(PermMask.getNode()) ||
4087        X86::isUNPCKHMask(PermMask.getNode()))
4088      return Op;
4089  }
4090
4091  // Try PSHUF* first, then SHUFP*.
4092  // MMX doesn't have PSHUFD but it does have PSHUFW. While it's theoretically
4093  // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
4094  if (isMMX && NumElems == 4 && X86::isPSHUFDMask(PermMask.getNode())) {
4095    if (V2.getOpcode() != ISD::UNDEF)
4096      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
4097                         DAG.getNode(ISD::UNDEF, VT), PermMask);
4098    return Op;
4099  }
4100
4101  if (!isMMX) {
4102    if (Subtarget->hasSSE2() &&
4103        (X86::isPSHUFDMask(PermMask.getNode()) ||
4104         X86::isPSHUFHWMask(PermMask.getNode()) ||
4105         X86::isPSHUFLWMask(PermMask.getNode()))) {
4106      MVT RVT = VT;
4107      if (VT == MVT::v4f32) {
4108        RVT = MVT::v4i32;
4109        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT,
4110                         DAG.getNode(ISD::BIT_CONVERT, RVT, V1),
4111                         DAG.getNode(ISD::UNDEF, RVT), PermMask);
4112      } else if (V2.getOpcode() != ISD::UNDEF)
4113        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT, V1,
4114                         DAG.getNode(ISD::UNDEF, RVT), PermMask);
4115      if (RVT != VT)
4116        Op = DAG.getNode(ISD::BIT_CONVERT, VT, Op);
4117      return Op;
4118    }
4119
4120    // Binary or unary shufps.
4121    if (X86::isSHUFPMask(PermMask.getNode()) ||
4122        (V2.getOpcode() == ISD::UNDEF && X86::isPSHUFDMask(PermMask.getNode())))
4123      return Op;
4124  }
4125
4126  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
4127  if (VT == MVT::v8i16) {
4128    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(V1, V2, PermMask, DAG, *this);
4129    if (NewOp.getNode())
4130      return NewOp;
4131  }
4132
4133  // Handle all 4 wide cases with a number of shuffles except for MMX.
4134  if (NumElems == 4 && !isMMX)
4135    return LowerVECTOR_SHUFFLE_4wide(V1, V2, PermMask, VT, DAG);
4136
4137  return SDValue();
4138}
4139
4140SDValue
4141X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
4142                                                SelectionDAG &DAG) {
4143  MVT VT = Op.getValueType();
4144  if (VT.getSizeInBits() == 8) {
4145    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, MVT::i32,
4146                                    Op.getOperand(0), Op.getOperand(1));
4147    SDValue Assert  = DAG.getNode(ISD::AssertZext, MVT::i32, Extract,
4148                                    DAG.getValueType(VT));
4149    return DAG.getNode(ISD::TRUNCATE, VT, Assert);
4150  } else if (VT.getSizeInBits() == 16) {
4151    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, MVT::i32,
4152                                    Op.getOperand(0), Op.getOperand(1));
4153    SDValue Assert  = DAG.getNode(ISD::AssertZext, MVT::i32, Extract,
4154                                    DAG.getValueType(VT));
4155    return DAG.getNode(ISD::TRUNCATE, VT, Assert);
4156  } else if (VT == MVT::f32) {
4157    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
4158    // the result back to FR32 register. It's only worth matching if the
4159    // result has a single use which is a store or a bitcast to i32.
4160    if (!Op.hasOneUse())
4161      return SDValue();
4162    SDNode *User = *Op.getNode()->use_begin();
4163    if (User->getOpcode() != ISD::STORE &&
4164        (User->getOpcode() != ISD::BIT_CONVERT ||
4165         User->getValueType(0) != MVT::i32))
4166      return SDValue();
4167    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32,
4168                    DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, Op.getOperand(0)),
4169                                    Op.getOperand(1));
4170    return DAG.getNode(ISD::BIT_CONVERT, MVT::f32, Extract);
4171  }
4172  return SDValue();
4173}
4174
4175
4176SDValue
4177X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4178  if (!isa<ConstantSDNode>(Op.getOperand(1)))
4179    return SDValue();
4180
4181  if (Subtarget->hasSSE41()) {
4182    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
4183    if (Res.getNode())
4184      return Res;
4185  }
4186
4187  MVT VT = Op.getValueType();
4188  // TODO: handle v16i8.
4189  if (VT.getSizeInBits() == 16) {
4190    SDValue Vec = Op.getOperand(0);
4191    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4192    if (Idx == 0)
4193      return DAG.getNode(ISD::TRUNCATE, MVT::i16,
4194                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32,
4195                                 DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, Vec),
4196                                     Op.getOperand(1)));
4197    // Transform it so it match pextrw which produces a 32-bit result.
4198    MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1);
4199    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, EVT,
4200                                    Op.getOperand(0), Op.getOperand(1));
4201    SDValue Assert  = DAG.getNode(ISD::AssertZext, EVT, Extract,
4202                                    DAG.getValueType(VT));
4203    return DAG.getNode(ISD::TRUNCATE, VT, Assert);
4204  } else if (VT.getSizeInBits() == 32) {
4205    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4206    if (Idx == 0)
4207      return Op;
4208    // SHUFPS the element to the lowest double word, then movss.
4209    MVT MaskVT = MVT::getIntVectorWithNumElements(4);
4210    SmallVector<SDValue, 8> IdxVec;
4211    IdxVec.
4212      push_back(DAG.getConstant(Idx, MaskVT.getVectorElementType()));
4213    IdxVec.
4214      push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType()));
4215    IdxVec.
4216      push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType()));
4217    IdxVec.
4218      push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType()));
4219    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
4220                                 &IdxVec[0], IdxVec.size());
4221    SDValue Vec = Op.getOperand(0);
4222    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
4223                      Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
4224    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
4225                       DAG.getIntPtrConstant(0));
4226  } else if (VT.getSizeInBits() == 64) {
4227    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
4228    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
4229    //        to match extract_elt for f64.
4230    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4231    if (Idx == 0)
4232      return Op;
4233
4234    // UNPCKHPD the element to the lowest double word, then movsd.
4235    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
4236    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
4237    MVT MaskVT = MVT::getIntVectorWithNumElements(2);
4238    SmallVector<SDValue, 8> IdxVec;
4239    IdxVec.push_back(DAG.getConstant(1, MaskVT.getVectorElementType()));
4240    IdxVec.
4241      push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType()));
4242    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
4243                                 &IdxVec[0], IdxVec.size());
4244    SDValue Vec = Op.getOperand(0);
4245    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
4246                      Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
4247    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
4248                       DAG.getIntPtrConstant(0));
4249  }
4250
4251  return SDValue();
4252}
4253
4254SDValue
4255X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
4256  MVT VT = Op.getValueType();
4257  MVT EVT = VT.getVectorElementType();
4258
4259  SDValue N0 = Op.getOperand(0);
4260  SDValue N1 = Op.getOperand(1);
4261  SDValue N2 = Op.getOperand(2);
4262
4263  if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) &&
4264      isa<ConstantSDNode>(N2)) {
4265    unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB
4266                                                  : X86ISD::PINSRW;
4267    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
4268    // argument.
4269    if (N1.getValueType() != MVT::i32)
4270      N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1);
4271    if (N2.getValueType() != MVT::i32)
4272      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4273    return DAG.getNode(Opc, VT, N0, N1, N2);
4274  } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
4275    // Bits [7:6] of the constant are the source select.  This will always be
4276    //  zero here.  The DAG Combiner may combine an extract_elt index into these
4277    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
4278    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
4279    // Bits [5:4] of the constant are the destination select.  This is the
4280    //  value of the incoming immediate.
4281    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
4282    //   combine either bitwise AND or insert of float 0.0 to set these bits.
4283    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
4284    return DAG.getNode(X86ISD::INSERTPS, VT, N0, N1, N2);
4285  }
4286  return SDValue();
4287}
4288
4289SDValue
4290X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4291  MVT VT = Op.getValueType();
4292  MVT EVT = VT.getVectorElementType();
4293
4294  if (Subtarget->hasSSE41())
4295    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
4296
4297  if (EVT == MVT::i8)
4298    return SDValue();
4299
4300  SDValue N0 = Op.getOperand(0);
4301  SDValue N1 = Op.getOperand(1);
4302  SDValue N2 = Op.getOperand(2);
4303
4304  if (EVT.getSizeInBits() == 16) {
4305    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
4306    // as its second argument.
4307    if (N1.getValueType() != MVT::i32)
4308      N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1);
4309    if (N2.getValueType() != MVT::i32)
4310      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4311    return DAG.getNode(X86ISD::PINSRW, VT, N0, N1, N2);
4312  }
4313  return SDValue();
4314}
4315
4316SDValue
4317X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
4318  if (Op.getValueType() == MVT::v2f32)
4319    return DAG.getNode(ISD::BIT_CONVERT, MVT::v2f32,
4320                       DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2i32,
4321                                   DAG.getNode(ISD::BIT_CONVERT, MVT::i32,
4322                                               Op.getOperand(0))));
4323
4324  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, Op.getOperand(0));
4325  MVT VT = MVT::v2i32;
4326  switch (Op.getValueType().getSimpleVT()) {
4327  default: break;
4328  case MVT::v16i8:
4329  case MVT::v8i16:
4330    VT = MVT::v4i32;
4331    break;
4332  }
4333  return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(),
4334                     DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, AnyExt));
4335}
4336
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
4343SDValue
4344X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
4345  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
4346  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(),
4347                                               getPointerTy(),
4348                                               CP->getAlignment());
4349  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
4350  // With PIC, the address is actually $g + Offset.
4351  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
4352      !Subtarget->isPICStyleRIPRel()) {
4353    Result = DAG.getNode(ISD::ADD, getPointerTy(),
4354                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
4355                         Result);
4356  }
4357
4358  return Result;
4359}
4360
4361SDValue
4362X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
4363                                      SelectionDAG &DAG) const {
4364  SDValue Result = DAG.getTargetGlobalAddress(GV, getPointerTy());
4365  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
4366  // With PIC, the address is actually $g + Offset.
4367  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
4368      !Subtarget->isPICStyleRIPRel()) {
4369    Result = DAG.getNode(ISD::ADD, getPointerTy(),
4370                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
4371                         Result);
4372  }
4373
4374  // For Darwin & Mingw32, external and weak symbols are indirect, so we want to
4375  // load the value at address GV, not the value of GV itself. This means that
4376  // the GlobalAddress must be in the base or index register of the address, not
4377  // the GV offset field. Platform check is inside GVRequiresExtraLoad() call
4378  // The same applies for external symbols during PIC codegen
4379  if (Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false))
4380    Result = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), Result,
4381                         PseudoSourceValue::getGOT(), 0);
4382
4383  return Result;
4384}
4385
4386SDValue
4387X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
4388  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
4389  return LowerGlobalAddress(GV, DAG);
4390}
4391
4392// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
4393static SDValue
4394LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4395                                const MVT PtrVT) {
4396  SDValue InFlag;
4397  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), X86::EBX,
4398                                     DAG.getNode(X86ISD::GlobalBaseReg,
4399                                                 PtrVT), InFlag);
4400  InFlag = Chain.getValue(1);
4401
4402  // emit leal symbol@TLSGD(,%ebx,1), %eax
4403  SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag);
4404  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
4405                                             GA->getValueType(0),
4406                                             GA->getOffset());
4407  SDValue Ops[] = { Chain,  TGA, InFlag };
4408  SDValue Result = DAG.getNode(X86ISD::TLSADDR, NodeTys, Ops, 3);
4409  InFlag = Result.getValue(2);
4410  Chain = Result.getValue(1);
4411
4412  // call ___tls_get_addr. This function receives its argument in
4413  // the register EAX.
4414  Chain = DAG.getCopyToReg(Chain, X86::EAX, Result, InFlag);
4415  InFlag = Chain.getValue(1);
4416
4417  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
4418  SDValue Ops1[] = { Chain,
4419                      DAG.getTargetExternalSymbol("___tls_get_addr",
4420                                                  PtrVT),
4421                      DAG.getRegister(X86::EAX, PtrVT),
4422                      DAG.getRegister(X86::EBX, PtrVT),
4423                      InFlag };
4424  Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops1, 5);
4425  InFlag = Chain.getValue(1);
4426
4427  return DAG.getCopyFromReg(Chain, X86::EAX, PtrVT, InFlag);
4428}
4429
4430// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
4431static SDValue
4432LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4433                                const MVT PtrVT) {
4434  SDValue InFlag, Chain;
4435
4436  // emit leaq symbol@TLSGD(%rip), %rdi
4437  SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag);
4438  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
4439                                             GA->getValueType(0),
4440                                             GA->getOffset());
4441  SDValue Ops[]  = { DAG.getEntryNode(), TGA};
4442  SDValue Result = DAG.getNode(X86ISD::TLSADDR, NodeTys, Ops, 2);
4443  Chain  = Result.getValue(1);
4444  InFlag = Result.getValue(2);
4445
4446  // call __tls_get_addr. This function receives its argument in
4447  // the register RDI.
4448  Chain = DAG.getCopyToReg(Chain, X86::RDI, Result, InFlag);
4449  InFlag = Chain.getValue(1);
4450
4451  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
4452  SDValue Ops1[] = { Chain,
4453                      DAG.getTargetExternalSymbol("__tls_get_addr",
4454                                                  PtrVT),
4455                      DAG.getRegister(X86::RDI, PtrVT),
4456                      InFlag };
4457  Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops1, 4);
4458  InFlag = Chain.getValue(1);
4459
4460  return DAG.getCopyFromReg(Chain, X86::RAX, PtrVT, InFlag);
4461}
4462
4463// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
4464// "local exec" model.
4465static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4466                                     const MVT PtrVT) {
4467  // Get the Thread Pointer
4468  SDValue ThreadPointer = DAG.getNode(X86ISD::THREAD_POINTER, PtrVT);
4469  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
4470  // exec)
4471  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
4472                                             GA->getValueType(0),
4473                                             GA->getOffset());
4474  SDValue Offset = DAG.getNode(X86ISD::Wrapper, PtrVT, TGA);
4475
4476  if (GA->getGlobal()->isDeclaration()) // initial exec TLS model
4477    Offset = DAG.getLoad(PtrVT, DAG.getEntryNode(), Offset,
4478                         PseudoSourceValue::getGOT(), 0);
4479
4480  // The address of the thread local variable is the add of the thread
4481  // pointer with the offset of the variable.
4482  return DAG.getNode(ISD::ADD, PtrVT, ThreadPointer, Offset);
4483}
4484
4485SDValue
4486X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
4487  // TODO: implement the "local dynamic" model
4488  // TODO: implement the "initial exec"model for pic executables
4489  assert(Subtarget->isTargetELF() &&
4490         "TLS not implemented for non-ELF targets");
4491  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
4492  // If the relocation model is PIC, use the "General Dynamic" TLS Model,
4493  // otherwise use the "Local Exec"TLS Model
4494  if (Subtarget->is64Bit()) {
4495    return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
4496  } else {
4497    if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
4498      return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
4499    else
4500      return LowerToTLSExecModel(GA, DAG, getPointerTy());
4501  }
4502}
4503
4504SDValue
4505X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
4506  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
4507  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
4508  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
4509  // With PIC, the address is actually $g + Offset.
4510  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
4511      !Subtarget->isPICStyleRIPRel()) {
4512    Result = DAG.getNode(ISD::ADD, getPointerTy(),
4513                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
4514                         Result);
4515  }
4516
4517  return Result;
4518}
4519
4520SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
4521  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
4522  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
4523  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
4524  // With PIC, the address is actually $g + Offset.
4525  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
4526      !Subtarget->isPICStyleRIPRel()) {
4527    Result = DAG.getNode(ISD::ADD, getPointerTy(),
4528                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
4529                         Result);
4530  }
4531
4532  return Result;
4533}
4534
4535/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
4536/// take a 2 x i32 value to shift plus a shift amount.
4537SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
4538  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4539  MVT VT = Op.getValueType();
4540  unsigned VTBits = VT.getSizeInBits();
4541  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
4542  SDValue ShOpLo = Op.getOperand(0);
4543  SDValue ShOpHi = Op.getOperand(1);
4544  SDValue ShAmt  = Op.getOperand(2);
4545  SDValue Tmp1 = isSRA ?
4546    DAG.getNode(ISD::SRA, VT, ShOpHi, DAG.getConstant(VTBits - 1, MVT::i8)) :
4547    DAG.getConstant(0, VT);
4548
4549  SDValue Tmp2, Tmp3;
4550  if (Op.getOpcode() == ISD::SHL_PARTS) {
4551    Tmp2 = DAG.getNode(X86ISD::SHLD, VT, ShOpHi, ShOpLo, ShAmt);
4552    Tmp3 = DAG.getNode(ISD::SHL, VT, ShOpLo, ShAmt);
4553  } else {
4554    Tmp2 = DAG.getNode(X86ISD::SHRD, VT, ShOpLo, ShOpHi, ShAmt);
4555    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, VT, ShOpHi, ShAmt);
4556  }
4557
4558  SDValue AndNode = DAG.getNode(ISD::AND, MVT::i8, ShAmt,
4559                                  DAG.getConstant(VTBits, MVT::i8));
4560  SDValue Cond = DAG.getNode(X86ISD::CMP, VT,
4561                               AndNode, DAG.getConstant(0, MVT::i8));
4562
4563  SDValue Hi, Lo;
4564  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
4565  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
4566  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
4567
4568  if (Op.getOpcode() == ISD::SHL_PARTS) {
4569    Hi = DAG.getNode(X86ISD::CMOV, VT, Ops0, 4);
4570    Lo = DAG.getNode(X86ISD::CMOV, VT, Ops1, 4);
4571  } else {
4572    Lo = DAG.getNode(X86ISD::CMOV, VT, Ops0, 4);
4573    Hi = DAG.getNode(X86ISD::CMOV, VT, Ops1, 4);
4574  }
4575
4576  SDValue Ops[2] = { Lo, Hi };
4577  return DAG.getMergeValues(Ops, 2);
4578}
4579
4580SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
4581  MVT SrcVT = Op.getOperand(0).getValueType();
4582  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
4583         "Unknown SINT_TO_FP to lower!");
4584
4585  // These are really Legal; caller falls through into that case.
4586  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
4587    return SDValue();
4588  if (SrcVT == MVT::i64 && Op.getValueType() != MVT::f80 &&
4589      Subtarget->is64Bit())
4590    return SDValue();
4591
4592  unsigned Size = SrcVT.getSizeInBits()/8;
4593  MachineFunction &MF = DAG.getMachineFunction();
4594  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
4595  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
4596  SDValue Chain = DAG.getStore(DAG.getEntryNode(), Op.getOperand(0),
4597                                 StackSlot,
4598                                 PseudoSourceValue::getFixedStack(SSFI), 0);
4599
4600  // Build the FILD
4601  SDVTList Tys;
4602  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
4603  if (useSSE)
4604    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
4605  else
4606    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
4607  SmallVector<SDValue, 8> Ops;
4608  Ops.push_back(Chain);
4609  Ops.push_back(StackSlot);
4610  Ops.push_back(DAG.getValueType(SrcVT));
4611  SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD,
4612                                 Tys, &Ops[0], Ops.size());
4613
4614  if (useSSE) {
4615    Chain = Result.getValue(1);
4616    SDValue InFlag = Result.getValue(2);
4617
4618    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
4619    // shouldn't be necessary except that RFP cannot be live across
4620    // multiple blocks. When stackifier is fixed, they can be uncoupled.
4621    MachineFunction &MF = DAG.getMachineFunction();
4622    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
4623    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
4624    Tys = DAG.getVTList(MVT::Other);
4625    SmallVector<SDValue, 8> Ops;
4626    Ops.push_back(Chain);
4627    Ops.push_back(Result);
4628    Ops.push_back(StackSlot);
4629    Ops.push_back(DAG.getValueType(Op.getValueType()));
4630    Ops.push_back(InFlag);
4631    Chain = DAG.getNode(X86ISD::FST, Tys, &Ops[0], Ops.size());
4632    Result = DAG.getLoad(Op.getValueType(), Chain, StackSlot,
4633                         PseudoSourceValue::getFixedStack(SSFI), 0);
4634  }
4635
4636  return Result;
4637}
4638
4639std::pair<SDValue,SDValue> X86TargetLowering::
4640FP_TO_SINTHelper(SDValue Op, SelectionDAG &DAG) {
4641  assert(Op.getValueType().getSimpleVT() <= MVT::i64 &&
4642         Op.getValueType().getSimpleVT() >= MVT::i16 &&
4643         "Unknown FP_TO_SINT to lower!");
4644
4645  // These are really Legal.
4646  if (Op.getValueType() == MVT::i32 &&
4647      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
4648    return std::make_pair(SDValue(), SDValue());
4649  if (Subtarget->is64Bit() &&
4650      Op.getValueType() == MVT::i64 &&
4651      Op.getOperand(0).getValueType() != MVT::f80)
4652    return std::make_pair(SDValue(), SDValue());
4653
4654  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
4655  // stack slot.
4656  MachineFunction &MF = DAG.getMachineFunction();
4657  unsigned MemSize = Op.getValueType().getSizeInBits()/8;
4658  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
4659  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
4660  unsigned Opc;
4661  switch (Op.getValueType().getSimpleVT()) {
4662  default: assert(0 && "Invalid FP_TO_SINT to lower!");
4663  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
4664  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
4665  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
4666  }
4667
4668  SDValue Chain = DAG.getEntryNode();
4669  SDValue Value = Op.getOperand(0);
4670  if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
4671    assert(Op.getValueType() == MVT::i64 && "Invalid FP_TO_SINT to lower!");
4672    Chain = DAG.getStore(Chain, Value, StackSlot,
4673                         PseudoSourceValue::getFixedStack(SSFI), 0);
4674    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
4675    SDValue Ops[] = {
4676      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
4677    };
4678    Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3);
4679    Chain = Value.getValue(1);
4680    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
4681    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
4682  }
4683
4684  // Build the FP_TO_INT*_IN_MEM
4685  SDValue Ops[] = { Chain, Value, StackSlot };
4686  SDValue FIST = DAG.getNode(Opc, MVT::Other, Ops, 3);
4687
4688  return std::make_pair(FIST, StackSlot);
4689}
4690
4691SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
4692  std::pair<SDValue,SDValue> Vals = FP_TO_SINTHelper(Op, DAG);
4693  SDValue FIST = Vals.first, StackSlot = Vals.second;
4694  if (FIST.getNode() == 0) return SDValue();
4695
4696  // Load the result.
4697  return DAG.getLoad(Op.getValueType(), FIST, StackSlot, NULL, 0);
4698}
4699
4700SDNode *X86TargetLowering::ExpandFP_TO_SINT(SDNode *N, SelectionDAG &DAG) {
4701  std::pair<SDValue,SDValue> Vals = FP_TO_SINTHelper(SDValue(N, 0), DAG);
4702  SDValue FIST = Vals.first, StackSlot = Vals.second;
4703  if (FIST.getNode() == 0) return 0;
4704
4705  MVT VT = N->getValueType(0);
4706
4707  // Return a load from the stack slot.
4708  SDValue Res = DAG.getLoad(VT, FIST, StackSlot, NULL, 0);
4709
4710  // Use MERGE_VALUES to drop the chain result value and get a node with one
4711  // result.  This requires turning off getMergeValues simplification, since
4712  // otherwise it will give us Res back.
4713  return DAG.getMergeValues(&Res, 1, false).getNode();
4714}
4715
4716SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
4717  MVT VT = Op.getValueType();
4718  MVT EltVT = VT;
4719  if (VT.isVector())
4720    EltVT = VT.getVectorElementType();
4721  std::vector<Constant*> CV;
4722  if (EltVT == MVT::f64) {
4723    Constant *C = ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63))));
4724    CV.push_back(C);
4725    CV.push_back(C);
4726  } else {
4727    Constant *C = ConstantFP::get(APFloat(APInt(32, ~(1U << 31))));
4728    CV.push_back(C);
4729    CV.push_back(C);
4730    CV.push_back(C);
4731    CV.push_back(C);
4732  }
4733  Constant *C = ConstantVector::get(CV);
4734  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
4735  SDValue Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx,
4736                               PseudoSourceValue::getConstantPool(), 0,
4737                               false, 16);
4738  return DAG.getNode(X86ISD::FAND, VT, Op.getOperand(0), Mask);
4739}
4740
4741SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
4742  MVT VT = Op.getValueType();
4743  MVT EltVT = VT;
4744  unsigned EltNum = 1;
4745  if (VT.isVector()) {
4746    EltVT = VT.getVectorElementType();
4747    EltNum = VT.getVectorNumElements();
4748  }
4749  std::vector<Constant*> CV;
4750  if (EltVT == MVT::f64) {
4751    Constant *C = ConstantFP::get(APFloat(APInt(64, 1ULL << 63)));
4752    CV.push_back(C);
4753    CV.push_back(C);
4754  } else {
4755    Constant *C = ConstantFP::get(APFloat(APInt(32, 1U << 31)));
4756    CV.push_back(C);
4757    CV.push_back(C);
4758    CV.push_back(C);
4759    CV.push_back(C);
4760  }
4761  Constant *C = ConstantVector::get(CV);
4762  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
4763  SDValue Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx,
4764                               PseudoSourceValue::getConstantPool(), 0,
4765                               false, 16);
4766  if (VT.isVector()) {
4767    return DAG.getNode(ISD::BIT_CONVERT, VT,
4768                       DAG.getNode(ISD::XOR, MVT::v2i64,
4769                    DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, Op.getOperand(0)),
4770                    DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, Mask)));
4771  } else {
4772    return DAG.getNode(X86ISD::FXOR, VT, Op.getOperand(0), Mask);
4773  }
4774}
4775
4776SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
4777  SDValue Op0 = Op.getOperand(0);
4778  SDValue Op1 = Op.getOperand(1);
4779  MVT VT = Op.getValueType();
4780  MVT SrcVT = Op1.getValueType();
4781
4782  // If second operand is smaller, extend it first.
4783  if (SrcVT.bitsLT(VT)) {
4784    Op1 = DAG.getNode(ISD::FP_EXTEND, VT, Op1);
4785    SrcVT = VT;
4786  }
4787  // And if it is bigger, shrink it first.
4788  if (SrcVT.bitsGT(VT)) {
4789    Op1 = DAG.getNode(ISD::FP_ROUND, VT, Op1, DAG.getIntPtrConstant(1));
4790    SrcVT = VT;
4791  }
4792
4793  // At this point the operands and the result should have the same
4794  // type, and that won't be f80 since that is not custom lowered.
4795
4796  // First get the sign bit of second operand.
4797  std::vector<Constant*> CV;
4798  if (SrcVT == MVT::f64) {
4799    CV.push_back(ConstantFP::get(APFloat(APInt(64, 1ULL << 63))));
4800    CV.push_back(ConstantFP::get(APFloat(APInt(64, 0))));
4801  } else {
4802    CV.push_back(ConstantFP::get(APFloat(APInt(32, 1U << 31))));
4803    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
4804    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
4805    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
4806  }
4807  Constant *C = ConstantVector::get(CV);
4808  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
4809  SDValue Mask1 = DAG.getLoad(SrcVT, DAG.getEntryNode(), CPIdx,
4810                                PseudoSourceValue::getConstantPool(), 0,
4811                                false, 16);
4812  SDValue SignBit = DAG.getNode(X86ISD::FAND, SrcVT, Op1, Mask1);
4813
4814  // Shift sign bit right or left if the two operands have different types.
4815  if (SrcVT.bitsGT(VT)) {
4816    // Op0 is MVT::f32, Op1 is MVT::f64.
4817    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2f64, SignBit);
4818    SignBit = DAG.getNode(X86ISD::FSRL, MVT::v2f64, SignBit,
4819                          DAG.getConstant(32, MVT::i32));
4820    SignBit = DAG.getNode(ISD::BIT_CONVERT, MVT::v4f32, SignBit);
4821    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::f32, SignBit,
4822                          DAG.getIntPtrConstant(0));
4823  }
4824
4825  // Clear first operand sign bit.
4826  CV.clear();
4827  if (VT == MVT::f64) {
4828    CV.push_back(ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63)))));
4829    CV.push_back(ConstantFP::get(APFloat(APInt(64, 0))));
4830  } else {
4831    CV.push_back(ConstantFP::get(APFloat(APInt(32, ~(1U << 31)))));
4832    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
4833    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
4834    CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
4835  }
4836  C = ConstantVector::get(CV);
4837  CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
4838  SDValue Mask2 = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx,
4839                                PseudoSourceValue::getConstantPool(), 0,
4840                                false, 16);
4841  SDValue Val = DAG.getNode(X86ISD::FAND, VT, Op0, Mask2);
4842
4843  // Or the value with the sign bit.
4844  return DAG.getNode(X86ISD::FOR, VT, Val, SignBit);
4845}
4846
4847SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
4848  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
4849  SDValue Cond;
4850  SDValue Op0 = Op.getOperand(0);
4851  SDValue Op1 = Op.getOperand(1);
4852  SDValue CC = Op.getOperand(2);
4853  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
4854  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
4855  unsigned X86CC;
4856
4857  if (translateX86CC(cast<CondCodeSDNode>(CC)->get(), isFP, X86CC,
4858                     Op0, Op1, DAG)) {
4859    Cond = DAG.getNode(X86ISD::CMP, MVT::i32, Op0, Op1);
4860    return DAG.getNode(X86ISD::SETCC, MVT::i8,
4861                       DAG.getConstant(X86CC, MVT::i8), Cond);
4862  }
4863
4864  assert(isFP && "Illegal integer SetCC!");
4865
4866  Cond = DAG.getNode(X86ISD::CMP, MVT::i32, Op0, Op1);
4867  switch (SetCCOpcode) {
4868  default: assert(false && "Illegal floating point SetCC!");
4869  case ISD::SETOEQ: {  // !PF & ZF
4870    SDValue Tmp1 = DAG.getNode(X86ISD::SETCC, MVT::i8,
4871                                 DAG.getConstant(X86::COND_NP, MVT::i8), Cond);
4872    SDValue Tmp2 = DAG.getNode(X86ISD::SETCC, MVT::i8,
4873                                 DAG.getConstant(X86::COND_E, MVT::i8), Cond);
4874    return DAG.getNode(ISD::AND, MVT::i8, Tmp1, Tmp2);
4875  }
4876  case ISD::SETUNE: {  // PF | !ZF
4877    SDValue Tmp1 = DAG.getNode(X86ISD::SETCC, MVT::i8,
4878                                 DAG.getConstant(X86::COND_P, MVT::i8), Cond);
4879    SDValue Tmp2 = DAG.getNode(X86ISD::SETCC, MVT::i8,
4880                                 DAG.getConstant(X86::COND_NE, MVT::i8), Cond);
4881    return DAG.getNode(ISD::OR, MVT::i8, Tmp1, Tmp2);
4882  }
4883  }
4884}
4885
4886SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
4887  SDValue Cond;
4888  SDValue Op0 = Op.getOperand(0);
4889  SDValue Op1 = Op.getOperand(1);
4890  SDValue CC = Op.getOperand(2);
4891  MVT VT = Op.getValueType();
4892  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
4893  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
4894
4895  if (isFP) {
4896    unsigned SSECC = 8;
4897    MVT VT0 = Op0.getValueType();
4898    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
4899    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
4900    bool Swap = false;
4901
4902    switch (SetCCOpcode) {
4903    default: break;
4904    case ISD::SETOEQ:
4905    case ISD::SETEQ:  SSECC = 0; break;
4906    case ISD::SETOGT:
4907    case ISD::SETGT: Swap = true; // Fallthrough
4908    case ISD::SETLT:
4909    case ISD::SETOLT: SSECC = 1; break;
4910    case ISD::SETOGE:
4911    case ISD::SETGE: Swap = true; // Fallthrough
4912    case ISD::SETLE:
4913    case ISD::SETOLE: SSECC = 2; break;
4914    case ISD::SETUO:  SSECC = 3; break;
4915    case ISD::SETUNE:
4916    case ISD::SETNE:  SSECC = 4; break;
4917    case ISD::SETULE: Swap = true;
4918    case ISD::SETUGE: SSECC = 5; break;
4919    case ISD::SETULT: Swap = true;
4920    case ISD::SETUGT: SSECC = 6; break;
4921    case ISD::SETO:   SSECC = 7; break;
4922    }
4923    if (Swap)
4924      std::swap(Op0, Op1);
4925
4926    // In the two special cases we can't handle, emit two comparisons.
4927    if (SSECC == 8) {
4928      if (SetCCOpcode == ISD::SETUEQ) {
4929        SDValue UNORD, EQ;
4930        UNORD = DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
4931        EQ = DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
4932        return DAG.getNode(ISD::OR, VT, UNORD, EQ);
4933      }
4934      else if (SetCCOpcode == ISD::SETONE) {
4935        SDValue ORD, NEQ;
4936        ORD = DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
4937        NEQ = DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
4938        return DAG.getNode(ISD::AND, VT, ORD, NEQ);
4939      }
4940      assert(0 && "Illegal FP comparison");
4941    }
4942    // Handle all other FP comparisons here.
4943    return DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
4944  }
4945
4946  // We are handling one of the integer comparisons here.  Since SSE only has
4947  // GT and EQ comparisons for integer, swapping operands and multiple
4948  // operations may be required for some comparisons.
4949  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
4950  bool Swap = false, Invert = false, FlipSigns = false;
4951
4952  switch (VT.getSimpleVT()) {
4953  default: break;
4954  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
4955  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
4956  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
4957  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
4958  }
4959
4960  switch (SetCCOpcode) {
4961  default: break;
4962  case ISD::SETNE:  Invert = true;
4963  case ISD::SETEQ:  Opc = EQOpc; break;
4964  case ISD::SETLT:  Swap = true;
4965  case ISD::SETGT:  Opc = GTOpc; break;
4966  case ISD::SETGE:  Swap = true;
4967  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
4968  case ISD::SETULT: Swap = true;
4969  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
4970  case ISD::SETUGE: Swap = true;
4971  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
4972  }
4973  if (Swap)
4974    std::swap(Op0, Op1);
4975
4976  // Since SSE has no unsigned integer comparisons, we need to flip  the sign
4977  // bits of the inputs before performing those operations.
4978  if (FlipSigns) {
4979    MVT EltVT = VT.getVectorElementType();
4980    SDValue SignBit = DAG.getConstant(EltVT.getIntegerVTSignBit(), EltVT);
4981    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
4982    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, VT, &SignBits[0],
4983                                    SignBits.size());
4984    Op0 = DAG.getNode(ISD::XOR, VT, Op0, SignVec);
4985    Op1 = DAG.getNode(ISD::XOR, VT, Op1, SignVec);
4986  }
4987
4988  SDValue Result = DAG.getNode(Opc, VT, Op0, Op1);
4989
4990  // If the logical-not of the result is required, perform that now.
4991  if (Invert) {
4992    MVT EltVT = VT.getVectorElementType();
4993    SDValue NegOne = DAG.getConstant(EltVT.getIntegerVTBitMask(), EltVT);
4994    std::vector<SDValue> NegOnes(VT.getVectorNumElements(), NegOne);
4995    SDValue NegOneV = DAG.getNode(ISD::BUILD_VECTOR, VT, &NegOnes[0],
4996                                    NegOnes.size());
4997    Result = DAG.getNode(ISD::XOR, VT, Result, NegOneV);
4998  }
4999  return Result;
5000}
5001
5002SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
5003  bool addTest = true;
5004  SDValue Cond  = Op.getOperand(0);
5005  SDValue CC;
5006
5007  if (Cond.getOpcode() == ISD::SETCC)
5008    Cond = LowerSETCC(Cond, DAG);
5009
5010  // If condition flag is set by a X86ISD::CMP, then use it as the condition
5011  // setting operand in place of the X86ISD::SETCC.
5012  if (Cond.getOpcode() == X86ISD::SETCC) {
5013    CC = Cond.getOperand(0);
5014
5015    SDValue Cmp = Cond.getOperand(1);
5016    unsigned Opc = Cmp.getOpcode();
5017    MVT VT = Op.getValueType();
5018
5019    bool IllegalFPCMov = false;
5020    if (VT.isFloatingPoint() && !VT.isVector() &&
5021        !isScalarFPTypeInSSEReg(VT))  // FPStack?
5022      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
5023
5024    if ((Opc == X86ISD::CMP ||
5025         Opc == X86ISD::COMI ||
5026         Opc == X86ISD::UCOMI) && !IllegalFPCMov) {
5027      Cond = Cmp;
5028      addTest = false;
5029    }
5030  }
5031
5032  if (addTest) {
5033    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5034    Cond= DAG.getNode(X86ISD::CMP, MVT::i32, Cond, DAG.getConstant(0, MVT::i8));
5035  }
5036
5037  const MVT *VTs = DAG.getNodeValueTypes(Op.getValueType(),
5038                                                    MVT::Flag);
5039  SmallVector<SDValue, 4> Ops;
5040  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
5041  // condition is true.
5042  Ops.push_back(Op.getOperand(2));
5043  Ops.push_back(Op.getOperand(1));
5044  Ops.push_back(CC);
5045  Ops.push_back(Cond);
5046  return DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
5047}
5048
5049SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
5050  bool addTest = true;
5051  SDValue Chain = Op.getOperand(0);
5052  SDValue Cond  = Op.getOperand(1);
5053  SDValue Dest  = Op.getOperand(2);
5054  SDValue CC;
5055
5056  if (Cond.getOpcode() == ISD::SETCC)
5057    Cond = LowerSETCC(Cond, DAG);
5058
5059  // If condition flag is set by a X86ISD::CMP, then use it as the condition
5060  // setting operand in place of the X86ISD::SETCC.
5061  if (Cond.getOpcode() == X86ISD::SETCC) {
5062    CC = Cond.getOperand(0);
5063
5064    SDValue Cmp = Cond.getOperand(1);
5065    unsigned Opc = Cmp.getOpcode();
5066    if (Opc == X86ISD::CMP ||
5067        Opc == X86ISD::COMI ||
5068        Opc == X86ISD::UCOMI) {
5069      Cond = Cmp;
5070      addTest = false;
5071    }
5072  }
5073
5074  if (addTest) {
5075    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5076    Cond= DAG.getNode(X86ISD::CMP, MVT::i32, Cond, DAG.getConstant(0, MVT::i8));
5077  }
5078  return DAG.getNode(X86ISD::BRCOND, Op.getValueType(),
5079                     Chain, Op.getOperand(2), CC, Cond);
5080}
5081
5082
5083// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
5084// Calls to _alloca is needed to probe the stack when allocating more than 4k
5085// bytes in one go. Touching the stack at 4K increments is necessary to ensure
5086// that the guard pages used by the OS virtual memory manager are allocated in
5087// correct sequence.
5088SDValue
5089X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
5090                                           SelectionDAG &DAG) {
5091  assert(Subtarget->isTargetCygMing() &&
5092         "This should be used only on Cygwin/Mingw targets");
5093
5094  // Get the inputs.
5095  SDValue Chain = Op.getOperand(0);
5096  SDValue Size  = Op.getOperand(1);
5097  // FIXME: Ensure alignment here
5098
5099  SDValue Flag;
5100
5101  MVT IntPtr = getPointerTy();
5102  MVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
5103
5104  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0));
5105
5106  Chain = DAG.getCopyToReg(Chain, X86::EAX, Size, Flag);
5107  Flag = Chain.getValue(1);
5108
5109  SDVTList  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
5110  SDValue Ops[] = { Chain,
5111                      DAG.getTargetExternalSymbol("_alloca", IntPtr),
5112                      DAG.getRegister(X86::EAX, IntPtr),
5113                      DAG.getRegister(X86StackPtr, SPTy),
5114                      Flag };
5115  Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops, 5);
5116  Flag = Chain.getValue(1);
5117
5118  Chain = DAG.getCALLSEQ_END(Chain,
5119                             DAG.getIntPtrConstant(0),
5120                             DAG.getIntPtrConstant(0),
5121                             Flag);
5122
5123  Chain = DAG.getCopyFromReg(Chain, X86StackPtr, SPTy).getValue(1);
5124
5125  SDValue Ops1[2] = { Chain.getValue(0), Chain };
5126  return DAG.getMergeValues(Ops1, 2);
5127}
5128
5129SDValue
5130X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG,
5131                                        SDValue Chain,
5132                                        SDValue Dst, SDValue Src,
5133                                        SDValue Size, unsigned Align,
5134                                        const Value *DstSV, uint64_t DstSVOff) {
5135  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
5136
5137  /// If not DWORD aligned or size is more than the threshold, call the library.
5138  /// The libc version is likely to be faster for these cases. It can use the
5139  /// address value and run time information about the CPU.
5140  if ((Align & 3) != 0 ||
5141      !ConstantSize ||
5142      ConstantSize->getZExtValue() >
5143        getSubtarget()->getMaxInlineSizeThreshold()) {
5144    SDValue InFlag(0, 0);
5145
5146    // Check to see if there is a specialized entry-point for memory zeroing.
5147    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
5148    if (const char *bzeroEntry =
5149          V && V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
5150      MVT IntPtr = getPointerTy();
5151      const Type *IntPtrTy = TD->getIntPtrType();
5152      TargetLowering::ArgListTy Args;
5153      TargetLowering::ArgListEntry Entry;
5154      Entry.Node = Dst;
5155      Entry.Ty = IntPtrTy;
5156      Args.push_back(Entry);
5157      Entry.Node = Size;
5158      Args.push_back(Entry);
5159      std::pair<SDValue,SDValue> CallResult =
5160        LowerCallTo(Chain, Type::VoidTy, false, false, false, false,
5161                    CallingConv::C, false,
5162                    DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG);
5163      return CallResult.second;
5164    }
5165
5166    // Otherwise have the target-independent code call memset.
5167    return SDValue();
5168  }
5169
5170  uint64_t SizeVal = ConstantSize->getZExtValue();
5171  SDValue InFlag(0, 0);
5172  MVT AVT;
5173  SDValue Count;
5174  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
5175  unsigned BytesLeft = 0;
5176  bool TwoRepStos = false;
5177  if (ValC) {
5178    unsigned ValReg;
5179    uint64_t Val = ValC->getZExtValue() & 255;
5180
5181    // If the value is a constant, then we can potentially use larger sets.
5182    switch (Align & 3) {
5183    case 2:   // WORD aligned
5184      AVT = MVT::i16;
5185      ValReg = X86::AX;
5186      Val = (Val << 8) | Val;
5187      break;
5188    case 0:  // DWORD aligned
5189      AVT = MVT::i32;
5190      ValReg = X86::EAX;
5191      Val = (Val << 8)  | Val;
5192      Val = (Val << 16) | Val;
5193      if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
5194        AVT = MVT::i64;
5195        ValReg = X86::RAX;
5196        Val = (Val << 32) | Val;
5197      }
5198      break;
5199    default:  // Byte aligned
5200      AVT = MVT::i8;
5201      ValReg = X86::AL;
5202      Count = DAG.getIntPtrConstant(SizeVal);
5203      break;
5204    }
5205
5206    if (AVT.bitsGT(MVT::i8)) {
5207      unsigned UBytes = AVT.getSizeInBits() / 8;
5208      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
5209      BytesLeft = SizeVal % UBytes;
5210    }
5211
5212    Chain  = DAG.getCopyToReg(Chain, ValReg, DAG.getConstant(Val, AVT),
5213                              InFlag);
5214    InFlag = Chain.getValue(1);
5215  } else {
5216    AVT = MVT::i8;
5217    Count  = DAG.getIntPtrConstant(SizeVal);
5218    Chain  = DAG.getCopyToReg(Chain, X86::AL, Src, InFlag);
5219    InFlag = Chain.getValue(1);
5220  }
5221
5222  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
5223                            Count, InFlag);
5224  InFlag = Chain.getValue(1);
5225  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
5226                            Dst, InFlag);
5227  InFlag = Chain.getValue(1);
5228
5229  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
5230  SmallVector<SDValue, 8> Ops;
5231  Ops.push_back(Chain);
5232  Ops.push_back(DAG.getValueType(AVT));
5233  Ops.push_back(InFlag);
5234  Chain  = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size());
5235
5236  if (TwoRepStos) {
5237    InFlag = Chain.getValue(1);
5238    Count  = Size;
5239    MVT CVT = Count.getValueType();
5240    SDValue Left = DAG.getNode(ISD::AND, CVT, Count,
5241                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
5242    Chain  = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
5243                              Left, InFlag);
5244    InFlag = Chain.getValue(1);
5245    Tys = DAG.getVTList(MVT::Other, MVT::Flag);
5246    Ops.clear();
5247    Ops.push_back(Chain);
5248    Ops.push_back(DAG.getValueType(MVT::i8));
5249    Ops.push_back(InFlag);
5250    Chain  = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size());
5251  } else if (BytesLeft) {
5252    // Handle the last 1 - 7 bytes.
5253    unsigned Offset = SizeVal - BytesLeft;
5254    MVT AddrVT = Dst.getValueType();
5255    MVT SizeVT = Size.getValueType();
5256
5257    Chain = DAG.getMemset(Chain,
5258                          DAG.getNode(ISD::ADD, AddrVT, Dst,
5259                                      DAG.getConstant(Offset, AddrVT)),
5260                          Src,
5261                          DAG.getConstant(BytesLeft, SizeVT),
5262                          Align, DstSV, DstSVOff + Offset);
5263  }
5264
5265  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
5266  return Chain;
5267}
5268
5269SDValue
5270X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG,
5271                                      SDValue Chain, SDValue Dst, SDValue Src,
5272                                      SDValue Size, unsigned Align,
5273                                      bool AlwaysInline,
5274                                      const Value *DstSV, uint64_t DstSVOff,
5275                                      const Value *SrcSV, uint64_t SrcSVOff) {
5276  // This requires the copy size to be a constant, preferrably
5277  // within a subtarget-specific limit.
5278  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
5279  if (!ConstantSize)
5280    return SDValue();
5281  uint64_t SizeVal = ConstantSize->getZExtValue();
5282  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
5283    return SDValue();
5284
5285  /// If not DWORD aligned, call the library.
5286  if ((Align & 3) != 0)
5287    return SDValue();
5288
5289  // DWORD aligned
5290  MVT AVT = MVT::i32;
5291  if (Subtarget->is64Bit() && ((Align & 0x7) == 0))  // QWORD aligned
5292    AVT = MVT::i64;
5293
5294  unsigned UBytes = AVT.getSizeInBits() / 8;
5295  unsigned CountVal = SizeVal / UBytes;
5296  SDValue Count = DAG.getIntPtrConstant(CountVal);
5297  unsigned BytesLeft = SizeVal % UBytes;
5298
5299  SDValue InFlag(0, 0);
5300  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
5301                            Count, InFlag);
5302  InFlag = Chain.getValue(1);
5303  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
5304                            Dst, InFlag);
5305  InFlag = Chain.getValue(1);
5306  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RSI : X86::ESI,
5307                            Src, InFlag);
5308  InFlag = Chain.getValue(1);
5309
5310  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
5311  SmallVector<SDValue, 8> Ops;
5312  Ops.push_back(Chain);
5313  Ops.push_back(DAG.getValueType(AVT));
5314  Ops.push_back(InFlag);
5315  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size());
5316
5317  SmallVector<SDValue, 4> Results;
5318  Results.push_back(RepMovs);
5319  if (BytesLeft) {
5320    // Handle the last 1 - 7 bytes.
5321    unsigned Offset = SizeVal - BytesLeft;
5322    MVT DstVT = Dst.getValueType();
5323    MVT SrcVT = Src.getValueType();
5324    MVT SizeVT = Size.getValueType();
5325    Results.push_back(DAG.getMemcpy(Chain,
5326                                    DAG.getNode(ISD::ADD, DstVT, Dst,
5327                                                DAG.getConstant(Offset, DstVT)),
5328                                    DAG.getNode(ISD::ADD, SrcVT, Src,
5329                                                DAG.getConstant(Offset, SrcVT)),
5330                                    DAG.getConstant(BytesLeft, SizeVT),
5331                                    Align, AlwaysInline,
5332                                    DstSV, DstSVOff + Offset,
5333                                    SrcSV, SrcSVOff + Offset));
5334  }
5335
5336  return DAG.getNode(ISD::TokenFactor, MVT::Other, &Results[0], Results.size());
5337}
5338
5339/// Expand the result of: i64,outchain = READCYCLECOUNTER inchain
5340SDNode *X86TargetLowering::ExpandREADCYCLECOUNTER(SDNode *N, SelectionDAG &DAG){
5341  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
5342  SDValue TheChain = N->getOperand(0);
5343  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, Tys, &TheChain, 1);
5344  if (Subtarget->is64Bit()) {
5345    SDValue rax = DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1));
5346    SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), X86::RDX,
5347                                       MVT::i64, rax.getValue(2));
5348    SDValue Tmp = DAG.getNode(ISD::SHL, MVT::i64, rdx,
5349                                DAG.getConstant(32, MVT::i8));
5350    SDValue Ops[] = {
5351      DAG.getNode(ISD::OR, MVT::i64, rax, Tmp), rdx.getValue(1)
5352    };
5353
5354    return DAG.getMergeValues(Ops, 2).getNode();
5355  }
5356
5357  SDValue eax = DAG.getCopyFromReg(rd, X86::EAX, MVT::i32, rd.getValue(1));
5358  SDValue edx = DAG.getCopyFromReg(eax.getValue(1), X86::EDX,
5359                                       MVT::i32, eax.getValue(2));
5360  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
5361  SDValue Ops[] = { eax, edx };
5362  Ops[0] = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Ops, 2);
5363
5364  // Use a MERGE_VALUES to return the value and chain.
5365  Ops[1] = edx.getValue(1);
5366  return DAG.getMergeValues(Ops, 2).getNode();
5367}
5368
5369SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
5370  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
5371
5372  if (!Subtarget->is64Bit()) {
5373    // vastart just stores the address of the VarArgsFrameIndex slot into the
5374    // memory location argument.
5375    SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
5376    return DAG.getStore(Op.getOperand(0), FR,Op.getOperand(1), SV, 0);
5377  }
5378
5379  // __va_list_tag:
5380  //   gp_offset         (0 - 6 * 8)
5381  //   fp_offset         (48 - 48 + 8 * 16)
5382  //   overflow_arg_area (point to parameters coming in memory).
5383  //   reg_save_area
5384  SmallVector<SDValue, 8> MemOps;
5385  SDValue FIN = Op.getOperand(1);
5386  // Store gp_offset
5387  SDValue Store = DAG.getStore(Op.getOperand(0),
5388                                 DAG.getConstant(VarArgsGPOffset, MVT::i32),
5389                                 FIN, SV, 0);
5390  MemOps.push_back(Store);
5391
5392  // Store fp_offset
5393  FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN, DAG.getIntPtrConstant(4));
5394  Store = DAG.getStore(Op.getOperand(0),
5395                       DAG.getConstant(VarArgsFPOffset, MVT::i32),
5396                       FIN, SV, 0);
5397  MemOps.push_back(Store);
5398
5399  // Store ptr to overflow_arg_area
5400  FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN, DAG.getIntPtrConstant(4));
5401  SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
5402  Store = DAG.getStore(Op.getOperand(0), OVFIN, FIN, SV, 0);
5403  MemOps.push_back(Store);
5404
5405  // Store ptr to reg_save_area.
5406  FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN, DAG.getIntPtrConstant(8));
5407  SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
5408  Store = DAG.getStore(Op.getOperand(0), RSFIN, FIN, SV, 0);
5409  MemOps.push_back(Store);
5410  return DAG.getNode(ISD::TokenFactor, MVT::Other, &MemOps[0], MemOps.size());
5411}
5412
5413SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
5414  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
5415  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
5416  SDValue Chain = Op.getOperand(0);
5417  SDValue SrcPtr = Op.getOperand(1);
5418  SDValue SrcSV = Op.getOperand(2);
5419
5420  assert(0 && "VAArgInst is not yet implemented for x86-64!");
5421  abort();
5422  return SDValue();
5423}
5424
5425SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
5426  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
5427  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
5428  SDValue Chain = Op.getOperand(0);
5429  SDValue DstPtr = Op.getOperand(1);
5430  SDValue SrcPtr = Op.getOperand(2);
5431  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
5432  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
5433
5434  return DAG.getMemcpy(Chain, DstPtr, SrcPtr,
5435                       DAG.getIntPtrConstant(24), 8, false,
5436                       DstSV, 0, SrcSV, 0);
5437}
5438
5439SDValue
5440X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
5441  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5442  switch (IntNo) {
5443  default: return SDValue();    // Don't custom lower most intrinsics.
5444  // Comparison intrinsics.
5445  case Intrinsic::x86_sse_comieq_ss:
5446  case Intrinsic::x86_sse_comilt_ss:
5447  case Intrinsic::x86_sse_comile_ss:
5448  case Intrinsic::x86_sse_comigt_ss:
5449  case Intrinsic::x86_sse_comige_ss:
5450  case Intrinsic::x86_sse_comineq_ss:
5451  case Intrinsic::x86_sse_ucomieq_ss:
5452  case Intrinsic::x86_sse_ucomilt_ss:
5453  case Intrinsic::x86_sse_ucomile_ss:
5454  case Intrinsic::x86_sse_ucomigt_ss:
5455  case Intrinsic::x86_sse_ucomige_ss:
5456  case Intrinsic::x86_sse_ucomineq_ss:
5457  case Intrinsic::x86_sse2_comieq_sd:
5458  case Intrinsic::x86_sse2_comilt_sd:
5459  case Intrinsic::x86_sse2_comile_sd:
5460  case Intrinsic::x86_sse2_comigt_sd:
5461  case Intrinsic::x86_sse2_comige_sd:
5462  case Intrinsic::x86_sse2_comineq_sd:
5463  case Intrinsic::x86_sse2_ucomieq_sd:
5464  case Intrinsic::x86_sse2_ucomilt_sd:
5465  case Intrinsic::x86_sse2_ucomile_sd:
5466  case Intrinsic::x86_sse2_ucomigt_sd:
5467  case Intrinsic::x86_sse2_ucomige_sd:
5468  case Intrinsic::x86_sse2_ucomineq_sd: {
5469    unsigned Opc = 0;
5470    ISD::CondCode CC = ISD::SETCC_INVALID;
5471    switch (IntNo) {
5472    default: break;
5473    case Intrinsic::x86_sse_comieq_ss:
5474    case Intrinsic::x86_sse2_comieq_sd:
5475      Opc = X86ISD::COMI;
5476      CC = ISD::SETEQ;
5477      break;
5478    case Intrinsic::x86_sse_comilt_ss:
5479    case Intrinsic::x86_sse2_comilt_sd:
5480      Opc = X86ISD::COMI;
5481      CC = ISD::SETLT;
5482      break;
5483    case Intrinsic::x86_sse_comile_ss:
5484    case Intrinsic::x86_sse2_comile_sd:
5485      Opc = X86ISD::COMI;
5486      CC = ISD::SETLE;
5487      break;
5488    case Intrinsic::x86_sse_comigt_ss:
5489    case Intrinsic::x86_sse2_comigt_sd:
5490      Opc = X86ISD::COMI;
5491      CC = ISD::SETGT;
5492      break;
5493    case Intrinsic::x86_sse_comige_ss:
5494    case Intrinsic::x86_sse2_comige_sd:
5495      Opc = X86ISD::COMI;
5496      CC = ISD::SETGE;
5497      break;
5498    case Intrinsic::x86_sse_comineq_ss:
5499    case Intrinsic::x86_sse2_comineq_sd:
5500      Opc = X86ISD::COMI;
5501      CC = ISD::SETNE;
5502      break;
5503    case Intrinsic::x86_sse_ucomieq_ss:
5504    case Intrinsic::x86_sse2_ucomieq_sd:
5505      Opc = X86ISD::UCOMI;
5506      CC = ISD::SETEQ;
5507      break;
5508    case Intrinsic::x86_sse_ucomilt_ss:
5509    case Intrinsic::x86_sse2_ucomilt_sd:
5510      Opc = X86ISD::UCOMI;
5511      CC = ISD::SETLT;
5512      break;
5513    case Intrinsic::x86_sse_ucomile_ss:
5514    case Intrinsic::x86_sse2_ucomile_sd:
5515      Opc = X86ISD::UCOMI;
5516      CC = ISD::SETLE;
5517      break;
5518    case Intrinsic::x86_sse_ucomigt_ss:
5519    case Intrinsic::x86_sse2_ucomigt_sd:
5520      Opc = X86ISD::UCOMI;
5521      CC = ISD::SETGT;
5522      break;
5523    case Intrinsic::x86_sse_ucomige_ss:
5524    case Intrinsic::x86_sse2_ucomige_sd:
5525      Opc = X86ISD::UCOMI;
5526      CC = ISD::SETGE;
5527      break;
5528    case Intrinsic::x86_sse_ucomineq_ss:
5529    case Intrinsic::x86_sse2_ucomineq_sd:
5530      Opc = X86ISD::UCOMI;
5531      CC = ISD::SETNE;
5532      break;
5533    }
5534
5535    unsigned X86CC;
5536    SDValue LHS = Op.getOperand(1);
5537    SDValue RHS = Op.getOperand(2);
5538    translateX86CC(CC, true, X86CC, LHS, RHS, DAG);
5539
5540    SDValue Cond = DAG.getNode(Opc, MVT::i32, LHS, RHS);
5541    SDValue SetCC = DAG.getNode(X86ISD::SETCC, MVT::i8,
5542                                DAG.getConstant(X86CC, MVT::i8), Cond);
5543    return DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, SetCC);
5544  }
5545
5546  // Fix vector shift instructions where the last operand is a non-immediate
5547  // i32 value.
5548  case Intrinsic::x86_sse2_pslli_w:
5549  case Intrinsic::x86_sse2_pslli_d:
5550  case Intrinsic::x86_sse2_pslli_q:
5551  case Intrinsic::x86_sse2_psrli_w:
5552  case Intrinsic::x86_sse2_psrli_d:
5553  case Intrinsic::x86_sse2_psrli_q:
5554  case Intrinsic::x86_sse2_psrai_w:
5555  case Intrinsic::x86_sse2_psrai_d:
5556  case Intrinsic::x86_mmx_pslli_w:
5557  case Intrinsic::x86_mmx_pslli_d:
5558  case Intrinsic::x86_mmx_pslli_q:
5559  case Intrinsic::x86_mmx_psrli_w:
5560  case Intrinsic::x86_mmx_psrli_d:
5561  case Intrinsic::x86_mmx_psrli_q:
5562  case Intrinsic::x86_mmx_psrai_w:
5563  case Intrinsic::x86_mmx_psrai_d: {
5564    SDValue ShAmt = Op.getOperand(2);
5565    if (isa<ConstantSDNode>(ShAmt))
5566      return SDValue();
5567
5568    unsigned NewIntNo = 0;
5569    MVT ShAmtVT = MVT::v4i32;
5570    switch (IntNo) {
5571    case Intrinsic::x86_sse2_pslli_w:
5572      NewIntNo = Intrinsic::x86_sse2_psll_w;
5573      break;
5574    case Intrinsic::x86_sse2_pslli_d:
5575      NewIntNo = Intrinsic::x86_sse2_psll_d;
5576      break;
5577    case Intrinsic::x86_sse2_pslli_q:
5578      NewIntNo = Intrinsic::x86_sse2_psll_q;
5579      break;
5580    case Intrinsic::x86_sse2_psrli_w:
5581      NewIntNo = Intrinsic::x86_sse2_psrl_w;
5582      break;
5583    case Intrinsic::x86_sse2_psrli_d:
5584      NewIntNo = Intrinsic::x86_sse2_psrl_d;
5585      break;
5586    case Intrinsic::x86_sse2_psrli_q:
5587      NewIntNo = Intrinsic::x86_sse2_psrl_q;
5588      break;
5589    case Intrinsic::x86_sse2_psrai_w:
5590      NewIntNo = Intrinsic::x86_sse2_psra_w;
5591      break;
5592    case Intrinsic::x86_sse2_psrai_d:
5593      NewIntNo = Intrinsic::x86_sse2_psra_d;
5594      break;
5595    default: {
5596      ShAmtVT = MVT::v2i32;
5597      switch (IntNo) {
5598      case Intrinsic::x86_mmx_pslli_w:
5599        NewIntNo = Intrinsic::x86_mmx_psll_w;
5600        break;
5601      case Intrinsic::x86_mmx_pslli_d:
5602        NewIntNo = Intrinsic::x86_mmx_psll_d;
5603        break;
5604      case Intrinsic::x86_mmx_pslli_q:
5605        NewIntNo = Intrinsic::x86_mmx_psll_q;
5606        break;
5607      case Intrinsic::x86_mmx_psrli_w:
5608        NewIntNo = Intrinsic::x86_mmx_psrl_w;
5609        break;
5610      case Intrinsic::x86_mmx_psrli_d:
5611        NewIntNo = Intrinsic::x86_mmx_psrl_d;
5612        break;
5613      case Intrinsic::x86_mmx_psrli_q:
5614        NewIntNo = Intrinsic::x86_mmx_psrl_q;
5615        break;
5616      case Intrinsic::x86_mmx_psrai_w:
5617        NewIntNo = Intrinsic::x86_mmx_psra_w;
5618        break;
5619      case Intrinsic::x86_mmx_psrai_d:
5620        NewIntNo = Intrinsic::x86_mmx_psra_d;
5621        break;
5622      default: abort();  // Can't reach here.
5623      }
5624      break;
5625    }
5626    }
5627    MVT VT = Op.getValueType();
5628    ShAmt = DAG.getNode(ISD::BIT_CONVERT, VT,
5629                        DAG.getNode(ISD::SCALAR_TO_VECTOR, ShAmtVT, ShAmt));
5630    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, VT,
5631                       DAG.getConstant(NewIntNo, MVT::i32),
5632                       Op.getOperand(1), ShAmt);
5633  }
5634  }
5635}
5636
5637SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
5638  // Depths > 0 not supported yet!
5639  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
5640    return SDValue();
5641
5642  // Just load the return address
5643  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
5644  return DAG.getLoad(getPointerTy(), DAG.getEntryNode(), RetAddrFI, NULL, 0);
5645}
5646
5647SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
5648  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5649  MFI->setFrameAddressIsTaken(true);
5650  MVT VT = Op.getValueType();
5651  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5652  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
5653  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), FrameReg, VT);
5654  while (Depth--)
5655    FrameAddr = DAG.getLoad(VT, DAG.getEntryNode(), FrameAddr, NULL, 0);
5656  return FrameAddr;
5657}
5658
5659SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
5660                                                     SelectionDAG &DAG) {
5661  return DAG.getIntPtrConstant(2*TD->getPointerSize());
5662}
5663
5664SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
5665{
5666  MachineFunction &MF = DAG.getMachineFunction();
5667  SDValue Chain     = Op.getOperand(0);
5668  SDValue Offset    = Op.getOperand(1);
5669  SDValue Handler   = Op.getOperand(2);
5670
5671  SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
5672                                  getPointerTy());
5673  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
5674
5675  SDValue StoreAddr = DAG.getNode(ISD::SUB, getPointerTy(), Frame,
5676                                  DAG.getIntPtrConstant(-TD->getPointerSize()));
5677  StoreAddr = DAG.getNode(ISD::ADD, getPointerTy(), StoreAddr, Offset);
5678  Chain = DAG.getStore(Chain, Handler, StoreAddr, NULL, 0);
5679  Chain = DAG.getCopyToReg(Chain, StoreAddrReg, StoreAddr);
5680  MF.getRegInfo().addLiveOut(StoreAddrReg);
5681
5682  return DAG.getNode(X86ISD::EH_RETURN,
5683                     MVT::Other,
5684                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
5685}
5686
5687SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
5688                                             SelectionDAG &DAG) {
5689  SDValue Root = Op.getOperand(0);
5690  SDValue Trmp = Op.getOperand(1); // trampoline
5691  SDValue FPtr = Op.getOperand(2); // nested function
5692  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
5693
5694  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
5695
5696  const X86InstrInfo *TII =
5697    ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
5698
5699  if (Subtarget->is64Bit()) {
5700    SDValue OutChains[6];
5701
5702    // Large code-model.
5703
5704    const unsigned char JMP64r  = TII->getBaseOpcodeFor(X86::JMP64r);
5705    const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri);
5706
5707    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
5708    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
5709
5710    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
5711
5712    // Load the pointer to the nested function into R11.
5713    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
5714    SDValue Addr = Trmp;
5715    OutChains[0] = DAG.getStore(Root, DAG.getConstant(OpCode, MVT::i16), Addr,
5716                                TrmpAddr, 0);
5717
5718    Addr = DAG.getNode(ISD::ADD, MVT::i64, Trmp, DAG.getConstant(2, MVT::i64));
5719    OutChains[1] = DAG.getStore(Root, FPtr, Addr, TrmpAddr, 2, false, 2);
5720
5721    // Load the 'nest' parameter value into R10.
5722    // R10 is specified in X86CallingConv.td
5723    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
5724    Addr = DAG.getNode(ISD::ADD, MVT::i64, Trmp, DAG.getConstant(10, MVT::i64));
5725    OutChains[2] = DAG.getStore(Root, DAG.getConstant(OpCode, MVT::i16), Addr,
5726                                TrmpAddr, 10);
5727
5728    Addr = DAG.getNode(ISD::ADD, MVT::i64, Trmp, DAG.getConstant(12, MVT::i64));
5729    OutChains[3] = DAG.getStore(Root, Nest, Addr, TrmpAddr, 12, false, 2);
5730
5731    // Jump to the nested function.
5732    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
5733    Addr = DAG.getNode(ISD::ADD, MVT::i64, Trmp, DAG.getConstant(20, MVT::i64));
5734    OutChains[4] = DAG.getStore(Root, DAG.getConstant(OpCode, MVT::i16), Addr,
5735                                TrmpAddr, 20);
5736
5737    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
5738    Addr = DAG.getNode(ISD::ADD, MVT::i64, Trmp, DAG.getConstant(22, MVT::i64));
5739    OutChains[5] = DAG.getStore(Root, DAG.getConstant(ModRM, MVT::i8), Addr,
5740                                TrmpAddr, 22);
5741
5742    SDValue Ops[] =
5743      { Trmp, DAG.getNode(ISD::TokenFactor, MVT::Other, OutChains, 6) };
5744    return DAG.getMergeValues(Ops, 2);
5745  } else {
5746    const Function *Func =
5747      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
5748    unsigned CC = Func->getCallingConv();
5749    unsigned NestReg;
5750
5751    switch (CC) {
5752    default:
5753      assert(0 && "Unsupported calling convention");
5754    case CallingConv::C:
5755    case CallingConv::X86_StdCall: {
5756      // Pass 'nest' parameter in ECX.
5757      // Must be kept in sync with X86CallingConv.td
5758      NestReg = X86::ECX;
5759
5760      // Check that ECX wasn't needed by an 'inreg' parameter.
5761      const FunctionType *FTy = Func->getFunctionType();
5762      const AttrListPtr &Attrs = Func->getAttributes();
5763
5764      if (!Attrs.isEmpty() && !Func->isVarArg()) {
5765        unsigned InRegCount = 0;
5766        unsigned Idx = 1;
5767
5768        for (FunctionType::param_iterator I = FTy->param_begin(),
5769             E = FTy->param_end(); I != E; ++I, ++Idx)
5770          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
5771            // FIXME: should only count parameters that are lowered to integers.
5772            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
5773
5774        if (InRegCount > 2) {
5775          cerr << "Nest register in use - reduce number of inreg parameters!\n";
5776          abort();
5777        }
5778      }
5779      break;
5780    }
5781    case CallingConv::X86_FastCall:
5782    case CallingConv::Fast:
5783      // Pass 'nest' parameter in EAX.
5784      // Must be kept in sync with X86CallingConv.td
5785      NestReg = X86::EAX;
5786      break;
5787    }
5788
5789    SDValue OutChains[4];
5790    SDValue Addr, Disp;
5791
5792    Addr = DAG.getNode(ISD::ADD, MVT::i32, Trmp, DAG.getConstant(10, MVT::i32));
5793    Disp = DAG.getNode(ISD::SUB, MVT::i32, FPtr, Addr);
5794
5795    const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri);
5796    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
5797    OutChains[0] = DAG.getStore(Root, DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
5798                                Trmp, TrmpAddr, 0);
5799
5800    Addr = DAG.getNode(ISD::ADD, MVT::i32, Trmp, DAG.getConstant(1, MVT::i32));
5801    OutChains[1] = DAG.getStore(Root, Nest, Addr, TrmpAddr, 1, false, 1);
5802
5803    const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP);
5804    Addr = DAG.getNode(ISD::ADD, MVT::i32, Trmp, DAG.getConstant(5, MVT::i32));
5805    OutChains[2] = DAG.getStore(Root, DAG.getConstant(JMP, MVT::i8), Addr,
5806                                TrmpAddr, 5, false, 1);
5807
5808    Addr = DAG.getNode(ISD::ADD, MVT::i32, Trmp, DAG.getConstant(6, MVT::i32));
5809    OutChains[3] = DAG.getStore(Root, Disp, Addr, TrmpAddr, 6, false, 1);
5810
5811    SDValue Ops[] =
5812      { Trmp, DAG.getNode(ISD::TokenFactor, MVT::Other, OutChains, 4) };
5813    return DAG.getMergeValues(Ops, 2);
5814  }
5815}
5816
5817SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
5818  /*
5819   The rounding mode is in bits 11:10 of FPSR, and has the following
5820   settings:
5821     00 Round to nearest
5822     01 Round to -inf
5823     10 Round to +inf
5824     11 Round to 0
5825
5826  FLT_ROUNDS, on the other hand, expects the following:
5827    -1 Undefined
5828     0 Round to 0
5829     1 Round to nearest
5830     2 Round to +inf
5831     3 Round to -inf
5832
5833  To perform the conversion, we do:
5834    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
5835  */
5836
5837  MachineFunction &MF = DAG.getMachineFunction();
5838  const TargetMachine &TM = MF.getTarget();
5839  const TargetFrameInfo &TFI = *TM.getFrameInfo();
5840  unsigned StackAlignment = TFI.getStackAlignment();
5841  MVT VT = Op.getValueType();
5842
5843  // Save FP Control Word to stack slot
5844  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment);
5845  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5846
5847  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, MVT::Other,
5848                              DAG.getEntryNode(), StackSlot);
5849
5850  // Load FP Control Word from stack slot
5851  SDValue CWD = DAG.getLoad(MVT::i16, Chain, StackSlot, NULL, 0);
5852
5853  // Transform as necessary
5854  SDValue CWD1 =
5855    DAG.getNode(ISD::SRL, MVT::i16,
5856                DAG.getNode(ISD::AND, MVT::i16,
5857                            CWD, DAG.getConstant(0x800, MVT::i16)),
5858                DAG.getConstant(11, MVT::i8));
5859  SDValue CWD2 =
5860    DAG.getNode(ISD::SRL, MVT::i16,
5861                DAG.getNode(ISD::AND, MVT::i16,
5862                            CWD, DAG.getConstant(0x400, MVT::i16)),
5863                DAG.getConstant(9, MVT::i8));
5864
5865  SDValue RetVal =
5866    DAG.getNode(ISD::AND, MVT::i16,
5867                DAG.getNode(ISD::ADD, MVT::i16,
5868                            DAG.getNode(ISD::OR, MVT::i16, CWD1, CWD2),
5869                            DAG.getConstant(1, MVT::i16)),
5870                DAG.getConstant(3, MVT::i16));
5871
5872
5873  return DAG.getNode((VT.getSizeInBits() < 16 ?
5874                      ISD::TRUNCATE : ISD::ZERO_EXTEND), VT, RetVal);
5875}
5876
5877SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
5878  MVT VT = Op.getValueType();
5879  MVT OpVT = VT;
5880  unsigned NumBits = VT.getSizeInBits();
5881
5882  Op = Op.getOperand(0);
5883  if (VT == MVT::i8) {
5884    // Zero extend to i32 since there is not an i8 bsr.
5885    OpVT = MVT::i32;
5886    Op = DAG.getNode(ISD::ZERO_EXTEND, OpVT, Op);
5887  }
5888
5889  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
5890  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
5891  Op = DAG.getNode(X86ISD::BSR, VTs, Op);
5892
5893  // If src is zero (i.e. bsr sets ZF), returns NumBits.
5894  SmallVector<SDValue, 4> Ops;
5895  Ops.push_back(Op);
5896  Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT));
5897  Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
5898  Ops.push_back(Op.getValue(1));
5899  Op = DAG.getNode(X86ISD::CMOV, OpVT, &Ops[0], 4);
5900
5901  // Finally xor with NumBits-1.
5902  Op = DAG.getNode(ISD::XOR, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
5903
5904  if (VT == MVT::i8)
5905    Op = DAG.getNode(ISD::TRUNCATE, MVT::i8, Op);
5906  return Op;
5907}
5908
5909SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
5910  MVT VT = Op.getValueType();
5911  MVT OpVT = VT;
5912  unsigned NumBits = VT.getSizeInBits();
5913
5914  Op = Op.getOperand(0);
5915  if (VT == MVT::i8) {
5916    OpVT = MVT::i32;
5917    Op = DAG.getNode(ISD::ZERO_EXTEND, OpVT, Op);
5918  }
5919
5920  // Issue a bsf (scan bits forward) which also sets EFLAGS.
5921  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
5922  Op = DAG.getNode(X86ISD::BSF, VTs, Op);
5923
5924  // If src is zero (i.e. bsf sets ZF), returns NumBits.
5925  SmallVector<SDValue, 4> Ops;
5926  Ops.push_back(Op);
5927  Ops.push_back(DAG.getConstant(NumBits, OpVT));
5928  Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
5929  Ops.push_back(Op.getValue(1));
5930  Op = DAG.getNode(X86ISD::CMOV, OpVT, &Ops[0], 4);
5931
5932  if (VT == MVT::i8)
5933    Op = DAG.getNode(ISD::TRUNCATE, MVT::i8, Op);
5934  return Op;
5935}
5936
5937SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
5938  MVT T = Op.getValueType();
5939  unsigned Reg = 0;
5940  unsigned size = 0;
5941  switch(T.getSimpleVT()) {
5942  default:
5943    assert(false && "Invalid value type!");
5944  case MVT::i8:  Reg = X86::AL;  size = 1; break;
5945  case MVT::i16: Reg = X86::AX;  size = 2; break;
5946  case MVT::i32: Reg = X86::EAX; size = 4; break;
5947  case MVT::i64:
5948    if (Subtarget->is64Bit()) {
5949      Reg = X86::RAX; size = 8;
5950    } else //Should go away when LowerType stuff lands
5951      return SDValue(ExpandATOMIC_CMP_SWAP(Op.getNode(), DAG), 0);
5952    break;
5953  };
5954  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), Reg,
5955                                    Op.getOperand(2), SDValue());
5956  SDValue Ops[] = { cpIn.getValue(0),
5957                    Op.getOperand(1),
5958                    Op.getOperand(3),
5959                    DAG.getTargetConstant(size, MVT::i8),
5960                    cpIn.getValue(1) };
5961  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
5962  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, Tys, Ops, 5);
5963  SDValue cpOut =
5964    DAG.getCopyFromReg(Result.getValue(0), Reg, T, Result.getValue(1));
5965  return cpOut;
5966}
5967
5968SDNode* X86TargetLowering::ExpandATOMIC_CMP_SWAP(SDNode* Op,
5969                                                 SelectionDAG &DAG) {
5970  MVT T = Op->getValueType(0);
5971  assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
5972  SDValue cpInL, cpInH;
5973  cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(2),
5974                      DAG.getConstant(0, MVT::i32));
5975  cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(2),
5976                      DAG.getConstant(1, MVT::i32));
5977  cpInL = DAG.getCopyToReg(Op->getOperand(0), X86::EAX,
5978                           cpInL, SDValue());
5979  cpInH = DAG.getCopyToReg(cpInL.getValue(0), X86::EDX,
5980                           cpInH, cpInL.getValue(1));
5981  SDValue swapInL, swapInH;
5982  swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(3),
5983                        DAG.getConstant(0, MVT::i32));
5984  swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(3),
5985                        DAG.getConstant(1, MVT::i32));
5986  swapInL = DAG.getCopyToReg(cpInH.getValue(0), X86::EBX,
5987                             swapInL, cpInH.getValue(1));
5988  swapInH = DAG.getCopyToReg(swapInL.getValue(0), X86::ECX,
5989                             swapInH, swapInL.getValue(1));
5990  SDValue Ops[] = { swapInH.getValue(0),
5991                    Op->getOperand(1),
5992                    swapInH.getValue(1) };
5993  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
5994  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, Tys, Ops, 3);
5995  SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), X86::EAX, MVT::i32,
5996                                        Result.getValue(1));
5997  SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), X86::EDX, MVT::i32,
5998                                        cpOutL.getValue(2));
5999  SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
6000  SDValue ResultVal = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, OpsF, 2);
6001  SDValue Vals[2] = { ResultVal, cpOutH.getValue(1) };
6002  return DAG.getMergeValues(Vals, 2).getNode();
6003}
6004
6005SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
6006  SDNode *Node = Op.getNode();
6007  MVT T = Node->getValueType(0);
6008  SDValue negOp = DAG.getNode(ISD::SUB, T,
6009                                DAG.getConstant(0, T), Node->getOperand(2));
6010  return DAG.getAtomic((Op.getOpcode()==ISD::ATOMIC_LOAD_SUB_8 ?
6011                                        ISD::ATOMIC_LOAD_ADD_8 :
6012                        Op.getOpcode()==ISD::ATOMIC_LOAD_SUB_16 ?
6013                                        ISD::ATOMIC_LOAD_ADD_16 :
6014                        Op.getOpcode()==ISD::ATOMIC_LOAD_SUB_32 ?
6015                                        ISD::ATOMIC_LOAD_ADD_32 :
6016                                        ISD::ATOMIC_LOAD_ADD_64),
6017                       Node->getOperand(0),
6018                       Node->getOperand(1), negOp,
6019                       cast<AtomicSDNode>(Node)->getSrcValue(),
6020                       cast<AtomicSDNode>(Node)->getAlignment());
6021}
6022
6023/// LowerOperation - Provide custom lowering hooks for some operations.
6024///
6025SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
6026  switch (Op.getOpcode()) {
6027  default: assert(0 && "Should not custom lower this!");
6028  case ISD::ATOMIC_CMP_SWAP_8:  return LowerCMP_SWAP(Op,DAG);
6029  case ISD::ATOMIC_CMP_SWAP_16: return LowerCMP_SWAP(Op,DAG);
6030  case ISD::ATOMIC_CMP_SWAP_32: return LowerCMP_SWAP(Op,DAG);
6031  case ISD::ATOMIC_CMP_SWAP_64: return LowerCMP_SWAP(Op,DAG);
6032  case ISD::ATOMIC_LOAD_SUB_8:  return LowerLOAD_SUB(Op,DAG);
6033  case ISD::ATOMIC_LOAD_SUB_16: return LowerLOAD_SUB(Op,DAG);
6034  case ISD::ATOMIC_LOAD_SUB_32: return LowerLOAD_SUB(Op,DAG);
6035  case ISD::ATOMIC_LOAD_SUB_64: return LowerLOAD_SUB(Op,DAG);
6036  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
6037  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
6038  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6039  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
6040  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
6041  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
6042  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
6043  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
6044  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
6045  case ISD::SHL_PARTS:
6046  case ISD::SRA_PARTS:
6047  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
6048  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
6049  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
6050  case ISD::FABS:               return LowerFABS(Op, DAG);
6051  case ISD::FNEG:               return LowerFNEG(Op, DAG);
6052  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
6053  case ISD::SETCC:              return LowerSETCC(Op, DAG);
6054  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
6055  case ISD::SELECT:             return LowerSELECT(Op, DAG);
6056  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
6057  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
6058  case ISD::CALL:               return LowerCALL(Op, DAG);
6059  case ISD::RET:                return LowerRET(Op, DAG);
6060  case ISD::FORMAL_ARGUMENTS:   return LowerFORMAL_ARGUMENTS(Op, DAG);
6061  case ISD::VASTART:            return LowerVASTART(Op, DAG);
6062  case ISD::VAARG:              return LowerVAARG(Op, DAG);
6063  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
6064  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6065  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
6066  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
6067  case ISD::FRAME_TO_ARGS_OFFSET:
6068                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
6069  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
6070  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
6071  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
6072  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
6073  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
6074  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
6075
6076  // FIXME: REMOVE THIS WHEN LegalizeDAGTypes lands.
6077  case ISD::READCYCLECOUNTER:
6078    return SDValue(ExpandREADCYCLECOUNTER(Op.getNode(), DAG), 0);
6079  }
6080}
6081
6082/// ReplaceNodeResults - Replace a node with an illegal result type
6083/// with a new node built out of custom code.
6084SDNode *X86TargetLowering::ReplaceNodeResults(SDNode *N, SelectionDAG &DAG) {
6085  switch (N->getOpcode()) {
6086  default: assert(0 && "Should not custom lower this!");
6087  case ISD::FP_TO_SINT:         return ExpandFP_TO_SINT(N, DAG);
6088  case ISD::READCYCLECOUNTER:   return ExpandREADCYCLECOUNTER(N, DAG);
6089  case ISD::ATOMIC_CMP_SWAP_64: return ExpandATOMIC_CMP_SWAP(N, DAG);
6090  }
6091}
6092
6093const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
6094  switch (Opcode) {
6095  default: return NULL;
6096  case X86ISD::BSF:                return "X86ISD::BSF";
6097  case X86ISD::BSR:                return "X86ISD::BSR";
6098  case X86ISD::SHLD:               return "X86ISD::SHLD";
6099  case X86ISD::SHRD:               return "X86ISD::SHRD";
6100  case X86ISD::FAND:               return "X86ISD::FAND";
6101  case X86ISD::FOR:                return "X86ISD::FOR";
6102  case X86ISD::FXOR:               return "X86ISD::FXOR";
6103  case X86ISD::FSRL:               return "X86ISD::FSRL";
6104  case X86ISD::FILD:               return "X86ISD::FILD";
6105  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
6106  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
6107  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
6108  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
6109  case X86ISD::FLD:                return "X86ISD::FLD";
6110  case X86ISD::FST:                return "X86ISD::FST";
6111  case X86ISD::CALL:               return "X86ISD::CALL";
6112  case X86ISD::TAILCALL:           return "X86ISD::TAILCALL";
6113  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
6114  case X86ISD::CMP:                return "X86ISD::CMP";
6115  case X86ISD::COMI:               return "X86ISD::COMI";
6116  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
6117  case X86ISD::SETCC:              return "X86ISD::SETCC";
6118  case X86ISD::CMOV:               return "X86ISD::CMOV";
6119  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
6120  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
6121  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
6122  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
6123  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
6124  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
6125  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
6126  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
6127  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
6128  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
6129  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
6130  case X86ISD::FMAX:               return "X86ISD::FMAX";
6131  case X86ISD::FMIN:               return "X86ISD::FMIN";
6132  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
6133  case X86ISD::FRCP:               return "X86ISD::FRCP";
6134  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
6135  case X86ISD::THREAD_POINTER:     return "X86ISD::THREAD_POINTER";
6136  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
6137  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
6138  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
6139  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
6140  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
6141  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
6142  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
6143  case X86ISD::VSHL:               return "X86ISD::VSHL";
6144  case X86ISD::VSRL:               return "X86ISD::VSRL";
6145  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
6146  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
6147  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
6148  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
6149  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
6150  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
6151  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
6152  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
6153  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
6154  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
6155  }
6156}
6157
6158// isLegalAddressingMode - Return true if the addressing mode represented
6159// by AM is legal for this target, for a load/store of the specified type.
6160bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
6161                                              const Type *Ty) const {
6162  // X86 supports extremely general addressing modes.
6163
6164  // X86 allows a sign-extended 32-bit immediate field as a displacement.
6165  if (AM.BaseOffs <= -(1LL << 32) || AM.BaseOffs >= (1LL << 32)-1)
6166    return false;
6167
6168  if (AM.BaseGV) {
6169    // We can only fold this if we don't need an extra load.
6170    if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
6171      return false;
6172
6173    // X86-64 only supports addr of globals in small code model.
6174    if (Subtarget->is64Bit()) {
6175      if (getTargetMachine().getCodeModel() != CodeModel::Small)
6176        return false;
6177      // If lower 4G is not available, then we must use rip-relative addressing.
6178      if (AM.BaseOffs || AM.Scale > 1)
6179        return false;
6180    }
6181  }
6182
6183  switch (AM.Scale) {
6184  case 0:
6185  case 1:
6186  case 2:
6187  case 4:
6188  case 8:
6189    // These scales always work.
6190    break;
6191  case 3:
6192  case 5:
6193  case 9:
6194    // These scales are formed with basereg+scalereg.  Only accept if there is
6195    // no basereg yet.
6196    if (AM.HasBaseReg)
6197      return false;
6198    break;
6199  default:  // Other stuff never works.
6200    return false;
6201  }
6202
6203  return true;
6204}
6205
6206
6207bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
6208  if (!Ty1->isInteger() || !Ty2->isInteger())
6209    return false;
6210  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
6211  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
6212  if (NumBits1 <= NumBits2)
6213    return false;
6214  return Subtarget->is64Bit() || NumBits1 < 64;
6215}
6216
6217bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const {
6218  if (!VT1.isInteger() || !VT2.isInteger())
6219    return false;
6220  unsigned NumBits1 = VT1.getSizeInBits();
6221  unsigned NumBits2 = VT2.getSizeInBits();
6222  if (NumBits1 <= NumBits2)
6223    return false;
6224  return Subtarget->is64Bit() || NumBits1 < 64;
6225}
6226
6227/// isShuffleMaskLegal - Targets can use this to indicate that they only
6228/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
6229/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
6230/// are assumed to be legal.
6231bool
6232X86TargetLowering::isShuffleMaskLegal(SDValue Mask, MVT VT) const {
6233  // Only do shuffles on 128-bit vector types for now.
6234  if (VT.getSizeInBits() == 64) return false;
6235  return (Mask.getNode()->getNumOperands() <= 4 ||
6236          isIdentityMask(Mask.getNode()) ||
6237          isIdentityMask(Mask.getNode(), true) ||
6238          isSplatMask(Mask.getNode())  ||
6239          isPSHUFHW_PSHUFLWMask(Mask.getNode()) ||
6240          X86::isUNPCKLMask(Mask.getNode()) ||
6241          X86::isUNPCKHMask(Mask.getNode()) ||
6242          X86::isUNPCKL_v_undef_Mask(Mask.getNode()) ||
6243          X86::isUNPCKH_v_undef_Mask(Mask.getNode()));
6244}
6245
6246bool
6247X86TargetLowering::isVectorClearMaskLegal(const std::vector<SDValue> &BVOps,
6248                                          MVT EVT, SelectionDAG &DAG) const {
6249  unsigned NumElts = BVOps.size();
6250  // Only do shuffles on 128-bit vector types for now.
6251  if (EVT.getSizeInBits() * NumElts == 64) return false;
6252  if (NumElts == 2) return true;
6253  if (NumElts == 4) {
6254    return (isMOVLMask(&BVOps[0], 4)  ||
6255            isCommutedMOVL(&BVOps[0], 4, true) ||
6256            isSHUFPMask(&BVOps[0], 4) ||
6257            isCommutedSHUFP(&BVOps[0], 4));
6258  }
6259  return false;
6260}
6261
6262//===----------------------------------------------------------------------===//
6263//                           X86 Scheduler Hooks
6264//===----------------------------------------------------------------------===//
6265
6266// private utility function
6267MachineBasicBlock *
6268X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
6269                                                       MachineBasicBlock *MBB,
6270                                                       unsigned regOpc,
6271                                                       unsigned immOpc,
6272                                                       unsigned LoadOpc,
6273                                                       unsigned CXchgOpc,
6274                                                       unsigned copyOpc,
6275                                                       unsigned notOpc,
6276                                                       unsigned EAXreg,
6277                                                       TargetRegisterClass *RC,
6278                                                       bool invSrc) {
6279  // For the atomic bitwise operator, we generate
6280  //   thisMBB:
6281  //   newMBB:
6282  //     ld  t1 = [bitinstr.addr]
6283  //     op  t2 = t1, [bitinstr.val]
6284  //     mov EAX = t1
6285  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
6286  //     bz  newMBB
6287  //     fallthrough -->nextMBB
6288  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6289  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
6290  MachineFunction::iterator MBBIter = MBB;
6291  ++MBBIter;
6292
6293  /// First build the CFG
6294  MachineFunction *F = MBB->getParent();
6295  MachineBasicBlock *thisMBB = MBB;
6296  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
6297  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
6298  F->insert(MBBIter, newMBB);
6299  F->insert(MBBIter, nextMBB);
6300
6301  // Move all successors to thisMBB to nextMBB
6302  nextMBB->transferSuccessors(thisMBB);
6303
6304  // Update thisMBB to fall through to newMBB
6305  thisMBB->addSuccessor(newMBB);
6306
6307  // newMBB jumps to itself and fall through to nextMBB
6308  newMBB->addSuccessor(nextMBB);
6309  newMBB->addSuccessor(newMBB);
6310
6311  // Insert instructions into newMBB based on incoming instruction
6312  assert(bInstr->getNumOperands() < 8 && "unexpected number of operands");
6313  MachineOperand& destOper = bInstr->getOperand(0);
6314  MachineOperand* argOpers[6];
6315  int numArgs = bInstr->getNumOperands() - 1;
6316  for (int i=0; i < numArgs; ++i)
6317    argOpers[i] = &bInstr->getOperand(i+1);
6318
6319  // x86 address has 4 operands: base, index, scale, and displacement
6320  int lastAddrIndx = 3; // [0,3]
6321  int valArgIndx = 4;
6322
6323  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
6324  MachineInstrBuilder MIB = BuildMI(newMBB, TII->get(LoadOpc), t1);
6325  for (int i=0; i <= lastAddrIndx; ++i)
6326    (*MIB).addOperand(*argOpers[i]);
6327
6328  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
6329  if (invSrc) {
6330    MIB = BuildMI(newMBB, TII->get(notOpc), tt).addReg(t1);
6331  }
6332  else
6333    tt = t1;
6334
6335  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
6336  assert((argOpers[valArgIndx]->isRegister() ||
6337          argOpers[valArgIndx]->isImmediate()) &&
6338         "invalid operand");
6339  if (argOpers[valArgIndx]->isRegister())
6340    MIB = BuildMI(newMBB, TII->get(regOpc), t2);
6341  else
6342    MIB = BuildMI(newMBB, TII->get(immOpc), t2);
6343  MIB.addReg(tt);
6344  (*MIB).addOperand(*argOpers[valArgIndx]);
6345
6346  MIB = BuildMI(newMBB, TII->get(copyOpc), EAXreg);
6347  MIB.addReg(t1);
6348
6349  MIB = BuildMI(newMBB, TII->get(CXchgOpc));
6350  for (int i=0; i <= lastAddrIndx; ++i)
6351    (*MIB).addOperand(*argOpers[i]);
6352  MIB.addReg(t2);
6353  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
6354  (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
6355
6356  MIB = BuildMI(newMBB, TII->get(copyOpc), destOper.getReg());
6357  MIB.addReg(EAXreg);
6358
6359  // insert branch
6360  BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB);
6361
6362  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
6363  return nextMBB;
6364}
6365
6366// private utility function
6367MachineBasicBlock *
6368X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
6369                                                      MachineBasicBlock *MBB,
6370                                                      unsigned cmovOpc) {
6371  // For the atomic min/max operator, we generate
6372  //   thisMBB:
6373  //   newMBB:
6374  //     ld t1 = [min/max.addr]
6375  //     mov t2 = [min/max.val]
6376  //     cmp  t1, t2
6377  //     cmov[cond] t2 = t1
6378  //     mov EAX = t1
6379  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
6380  //     bz   newMBB
6381  //     fallthrough -->nextMBB
6382  //
6383  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6384  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
6385  MachineFunction::iterator MBBIter = MBB;
6386  ++MBBIter;
6387
6388  /// First build the CFG
6389  MachineFunction *F = MBB->getParent();
6390  MachineBasicBlock *thisMBB = MBB;
6391  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
6392  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
6393  F->insert(MBBIter, newMBB);
6394  F->insert(MBBIter, nextMBB);
6395
6396  // Move all successors to thisMBB to nextMBB
6397  nextMBB->transferSuccessors(thisMBB);
6398
6399  // Update thisMBB to fall through to newMBB
6400  thisMBB->addSuccessor(newMBB);
6401
6402  // newMBB jumps to newMBB and fall through to nextMBB
6403  newMBB->addSuccessor(nextMBB);
6404  newMBB->addSuccessor(newMBB);
6405
6406  // Insert instructions into newMBB based on incoming instruction
6407  assert(mInstr->getNumOperands() < 8 && "unexpected number of operands");
6408  MachineOperand& destOper = mInstr->getOperand(0);
6409  MachineOperand* argOpers[6];
6410  int numArgs = mInstr->getNumOperands() - 1;
6411  for (int i=0; i < numArgs; ++i)
6412    argOpers[i] = &mInstr->getOperand(i+1);
6413
6414  // x86 address has 4 operands: base, index, scale, and displacement
6415  int lastAddrIndx = 3; // [0,3]
6416  int valArgIndx = 4;
6417
6418  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
6419  MachineInstrBuilder MIB = BuildMI(newMBB, TII->get(X86::MOV32rm), t1);
6420  for (int i=0; i <= lastAddrIndx; ++i)
6421    (*MIB).addOperand(*argOpers[i]);
6422
6423  // We only support register and immediate values
6424  assert((argOpers[valArgIndx]->isRegister() ||
6425          argOpers[valArgIndx]->isImmediate()) &&
6426         "invalid operand");
6427
6428  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
6429  if (argOpers[valArgIndx]->isRegister())
6430    MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), t2);
6431  else
6432    MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), t2);
6433  (*MIB).addOperand(*argOpers[valArgIndx]);
6434
6435  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), X86::EAX);
6436  MIB.addReg(t1);
6437
6438  MIB = BuildMI(newMBB, TII->get(X86::CMP32rr));
6439  MIB.addReg(t1);
6440  MIB.addReg(t2);
6441
6442  // Generate movc
6443  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
6444  MIB = BuildMI(newMBB, TII->get(cmovOpc),t3);
6445  MIB.addReg(t2);
6446  MIB.addReg(t1);
6447
6448  // Cmp and exchange if none has modified the memory location
6449  MIB = BuildMI(newMBB, TII->get(X86::LCMPXCHG32));
6450  for (int i=0; i <= lastAddrIndx; ++i)
6451    (*MIB).addOperand(*argOpers[i]);
6452  MIB.addReg(t3);
6453  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
6454  (*MIB).addMemOperand(*F, *mInstr->memoperands_begin());
6455
6456  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), destOper.getReg());
6457  MIB.addReg(X86::EAX);
6458
6459  // insert branch
6460  BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB);
6461
6462  F->DeleteMachineInstr(mInstr);   // The pseudo instruction is gone now.
6463  return nextMBB;
6464}
6465
6466
6467MachineBasicBlock *
6468X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
6469                                               MachineBasicBlock *BB) {
6470  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6471  switch (MI->getOpcode()) {
6472  default: assert(false && "Unexpected instr type to insert");
6473  case X86::CMOV_FR32:
6474  case X86::CMOV_FR64:
6475  case X86::CMOV_V4F32:
6476  case X86::CMOV_V2F64:
6477  case X86::CMOV_V2I64: {
6478    // To "insert" a SELECT_CC instruction, we actually have to insert the
6479    // diamond control-flow pattern.  The incoming instruction knows the
6480    // destination vreg to set, the condition code register to branch on, the
6481    // true/false values to select between, and a branch opcode to use.
6482    const BasicBlock *LLVM_BB = BB->getBasicBlock();
6483    MachineFunction::iterator It = BB;
6484    ++It;
6485
6486    //  thisMBB:
6487    //  ...
6488    //   TrueVal = ...
6489    //   cmpTY ccX, r1, r2
6490    //   bCC copy1MBB
6491    //   fallthrough --> copy0MBB
6492    MachineBasicBlock *thisMBB = BB;
6493    MachineFunction *F = BB->getParent();
6494    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
6495    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
6496    unsigned Opc =
6497      X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
6498    BuildMI(BB, TII->get(Opc)).addMBB(sinkMBB);
6499    F->insert(It, copy0MBB);
6500    F->insert(It, sinkMBB);
6501    // Update machine-CFG edges by transferring all successors of the current
6502    // block to the new block which will contain the Phi node for the select.
6503    sinkMBB->transferSuccessors(BB);
6504
6505    // Add the true and fallthrough blocks as its successors.
6506    BB->addSuccessor(copy0MBB);
6507    BB->addSuccessor(sinkMBB);
6508
6509    //  copy0MBB:
6510    //   %FalseValue = ...
6511    //   # fallthrough to sinkMBB
6512    BB = copy0MBB;
6513
6514    // Update machine-CFG edges
6515    BB->addSuccessor(sinkMBB);
6516
6517    //  sinkMBB:
6518    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
6519    //  ...
6520    BB = sinkMBB;
6521    BuildMI(BB, TII->get(X86::PHI), MI->getOperand(0).getReg())
6522      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
6523      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
6524
6525    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
6526    return BB;
6527  }
6528
6529  case X86::FP32_TO_INT16_IN_MEM:
6530  case X86::FP32_TO_INT32_IN_MEM:
6531  case X86::FP32_TO_INT64_IN_MEM:
6532  case X86::FP64_TO_INT16_IN_MEM:
6533  case X86::FP64_TO_INT32_IN_MEM:
6534  case X86::FP64_TO_INT64_IN_MEM:
6535  case X86::FP80_TO_INT16_IN_MEM:
6536  case X86::FP80_TO_INT32_IN_MEM:
6537  case X86::FP80_TO_INT64_IN_MEM: {
6538    // Change the floating point control register to use "round towards zero"
6539    // mode when truncating to an integer value.
6540    MachineFunction *F = BB->getParent();
6541    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
6542    addFrameReference(BuildMI(BB, TII->get(X86::FNSTCW16m)), CWFrameIdx);
6543
6544    // Load the old value of the high byte of the control word...
6545    unsigned OldCW =
6546      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
6547    addFrameReference(BuildMI(BB, TII->get(X86::MOV16rm), OldCW), CWFrameIdx);
6548
6549    // Set the high part to be round to zero...
6550    addFrameReference(BuildMI(BB, TII->get(X86::MOV16mi)), CWFrameIdx)
6551      .addImm(0xC7F);
6552
6553    // Reload the modified control word now...
6554    addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);
6555
6556    // Restore the memory image of control word to original value
6557    addFrameReference(BuildMI(BB, TII->get(X86::MOV16mr)), CWFrameIdx)
6558      .addReg(OldCW);
6559
6560    // Get the X86 opcode to use.
6561    unsigned Opc;
6562    switch (MI->getOpcode()) {
6563    default: assert(0 && "illegal opcode!");
6564    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
6565    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
6566    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
6567    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
6568    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
6569    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
6570    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
6571    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
6572    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
6573    }
6574
6575    X86AddressMode AM;
6576    MachineOperand &Op = MI->getOperand(0);
6577    if (Op.isRegister()) {
6578      AM.BaseType = X86AddressMode::RegBase;
6579      AM.Base.Reg = Op.getReg();
6580    } else {
6581      AM.BaseType = X86AddressMode::FrameIndexBase;
6582      AM.Base.FrameIndex = Op.getIndex();
6583    }
6584    Op = MI->getOperand(1);
6585    if (Op.isImmediate())
6586      AM.Scale = Op.getImm();
6587    Op = MI->getOperand(2);
6588    if (Op.isImmediate())
6589      AM.IndexReg = Op.getImm();
6590    Op = MI->getOperand(3);
6591    if (Op.isGlobalAddress()) {
6592      AM.GV = Op.getGlobal();
6593    } else {
6594      AM.Disp = Op.getImm();
6595    }
6596    addFullAddress(BuildMI(BB, TII->get(Opc)), AM)
6597                      .addReg(MI->getOperand(4).getReg());
6598
6599    // Reload the original control word now.
6600    addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);
6601
6602    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
6603    return BB;
6604  }
6605  case X86::ATOMAND32:
6606    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
6607                                               X86::AND32ri, X86::MOV32rm,
6608                                               X86::LCMPXCHG32, X86::MOV32rr,
6609                                               X86::NOT32r, X86::EAX,
6610                                               X86::GR32RegisterClass);
6611  case X86::ATOMOR32:
6612    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
6613                                               X86::OR32ri, X86::MOV32rm,
6614                                               X86::LCMPXCHG32, X86::MOV32rr,
6615                                               X86::NOT32r, X86::EAX,
6616                                               X86::GR32RegisterClass);
6617  case X86::ATOMXOR32:
6618    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
6619                                               X86::XOR32ri, X86::MOV32rm,
6620                                               X86::LCMPXCHG32, X86::MOV32rr,
6621                                               X86::NOT32r, X86::EAX,
6622                                               X86::GR32RegisterClass);
6623  case X86::ATOMNAND32:
6624    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
6625                                               X86::AND32ri, X86::MOV32rm,
6626                                               X86::LCMPXCHG32, X86::MOV32rr,
6627                                               X86::NOT32r, X86::EAX,
6628                                               X86::GR32RegisterClass, true);
6629  case X86::ATOMMIN32:
6630    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
6631  case X86::ATOMMAX32:
6632    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
6633  case X86::ATOMUMIN32:
6634    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
6635  case X86::ATOMUMAX32:
6636    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
6637
6638  case X86::ATOMAND16:
6639    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
6640                                               X86::AND16ri, X86::MOV16rm,
6641                                               X86::LCMPXCHG16, X86::MOV16rr,
6642                                               X86::NOT16r, X86::AX,
6643                                               X86::GR16RegisterClass);
6644  case X86::ATOMOR16:
6645    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
6646                                               X86::OR16ri, X86::MOV16rm,
6647                                               X86::LCMPXCHG16, X86::MOV16rr,
6648                                               X86::NOT16r, X86::AX,
6649                                               X86::GR16RegisterClass);
6650  case X86::ATOMXOR16:
6651    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
6652                                               X86::XOR16ri, X86::MOV16rm,
6653                                               X86::LCMPXCHG16, X86::MOV16rr,
6654                                               X86::NOT16r, X86::AX,
6655                                               X86::GR16RegisterClass);
6656  case X86::ATOMNAND16:
6657    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
6658                                               X86::AND16ri, X86::MOV16rm,
6659                                               X86::LCMPXCHG16, X86::MOV16rr,
6660                                               X86::NOT16r, X86::AX,
6661                                               X86::GR16RegisterClass, true);
6662  case X86::ATOMMIN16:
6663    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
6664  case X86::ATOMMAX16:
6665    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
6666  case X86::ATOMUMIN16:
6667    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
6668  case X86::ATOMUMAX16:
6669    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
6670
6671  case X86::ATOMAND8:
6672    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
6673                                               X86::AND8ri, X86::MOV8rm,
6674                                               X86::LCMPXCHG8, X86::MOV8rr,
6675                                               X86::NOT8r, X86::AL,
6676                                               X86::GR8RegisterClass);
6677  case X86::ATOMOR8:
6678    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
6679                                               X86::OR8ri, X86::MOV8rm,
6680                                               X86::LCMPXCHG8, X86::MOV8rr,
6681                                               X86::NOT8r, X86::AL,
6682                                               X86::GR8RegisterClass);
6683  case X86::ATOMXOR8:
6684    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
6685                                               X86::XOR8ri, X86::MOV8rm,
6686                                               X86::LCMPXCHG8, X86::MOV8rr,
6687                                               X86::NOT8r, X86::AL,
6688                                               X86::GR8RegisterClass);
6689  case X86::ATOMNAND8:
6690    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
6691                                               X86::AND8ri, X86::MOV8rm,
6692                                               X86::LCMPXCHG8, X86::MOV8rr,
6693                                               X86::NOT8r, X86::AL,
6694                                               X86::GR8RegisterClass, true);
6695  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
6696  case X86::ATOMAND64:
6697    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
6698                                               X86::AND64ri32, X86::MOV64rm,
6699                                               X86::LCMPXCHG64, X86::MOV64rr,
6700                                               X86::NOT64r, X86::RAX,
6701                                               X86::GR64RegisterClass);
6702  case X86::ATOMOR64:
6703    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
6704                                               X86::OR64ri32, X86::MOV64rm,
6705                                               X86::LCMPXCHG64, X86::MOV64rr,
6706                                               X86::NOT64r, X86::RAX,
6707                                               X86::GR64RegisterClass);
6708  case X86::ATOMXOR64:
6709    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
6710                                               X86::XOR64ri32, X86::MOV64rm,
6711                                               X86::LCMPXCHG64, X86::MOV64rr,
6712                                               X86::NOT64r, X86::RAX,
6713                                               X86::GR64RegisterClass);
6714  case X86::ATOMNAND64:
6715    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
6716                                               X86::AND64ri32, X86::MOV64rm,
6717                                               X86::LCMPXCHG64, X86::MOV64rr,
6718                                               X86::NOT64r, X86::RAX,
6719                                               X86::GR64RegisterClass, true);
6720  case X86::ATOMMIN64:
6721    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
6722  case X86::ATOMMAX64:
6723    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
6724  case X86::ATOMUMIN64:
6725    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
6726  case X86::ATOMUMAX64:
6727    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
6728  }
6729}
6730
6731//===----------------------------------------------------------------------===//
6732//                           X86 Optimization Hooks
6733//===----------------------------------------------------------------------===//
6734
6735void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
6736                                                       const APInt &Mask,
6737                                                       APInt &KnownZero,
6738                                                       APInt &KnownOne,
6739                                                       const SelectionDAG &DAG,
6740                                                       unsigned Depth) const {
6741  unsigned Opc = Op.getOpcode();
6742  assert((Opc >= ISD::BUILTIN_OP_END ||
6743          Opc == ISD::INTRINSIC_WO_CHAIN ||
6744          Opc == ISD::INTRINSIC_W_CHAIN ||
6745          Opc == ISD::INTRINSIC_VOID) &&
6746         "Should use MaskedValueIsZero if you don't know whether Op"
6747         " is a target node!");
6748
6749  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
6750  switch (Opc) {
6751  default: break;
6752  case X86ISD::SETCC:
6753    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
6754                                       Mask.getBitWidth() - 1);
6755    break;
6756  }
6757}
6758
6759/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
6760/// node is a GlobalAddress + offset.
6761bool X86TargetLowering::isGAPlusOffset(SDNode *N,
6762                                       GlobalValue* &GA, int64_t &Offset) const{
6763  if (N->getOpcode() == X86ISD::Wrapper) {
6764    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
6765      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
6766      return true;
6767    }
6768  }
6769  return TargetLowering::isGAPlusOffset(N, GA, Offset);
6770}
6771
6772static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
6773                               const TargetLowering &TLI) {
6774  GlobalValue *GV;
6775  int64_t Offset = 0;
6776  if (TLI.isGAPlusOffset(Base, GV, Offset))
6777    return (GV->getAlignment() >= N && (Offset % N) == 0);
6778  // DAG combine handles the stack object case.
6779  return false;
6780}
6781
6782static bool EltsFromConsecutiveLoads(SDNode *N, SDValue PermMask,
6783                                     unsigned NumElems, MVT EVT,
6784                                     SDNode *&Base,
6785                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
6786                                     const TargetLowering &TLI) {
6787  Base = NULL;
6788  for (unsigned i = 0; i < NumElems; ++i) {
6789    SDValue Idx = PermMask.getOperand(i);
6790    if (Idx.getOpcode() == ISD::UNDEF) {
6791      if (!Base)
6792        return false;
6793      continue;
6794    }
6795
6796    SDValue Elt = DAG.getShuffleScalarElt(N, i);
6797    if (!Elt.getNode() ||
6798        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6799      return false;
6800    if (!Base) {
6801      Base = Elt.getNode();
6802      if (Base->getOpcode() == ISD::UNDEF)
6803        return false;
6804      continue;
6805    }
6806    if (Elt.getOpcode() == ISD::UNDEF)
6807      continue;
6808
6809    if (!TLI.isConsecutiveLoad(Elt.getNode(), Base,
6810                               EVT.getSizeInBits()/8, i, MFI))
6811      return false;
6812  }
6813  return true;
6814}
6815
6816/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
6817/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
6818/// if the load addresses are consecutive, non-overlapping, and in the right
6819/// order.
6820static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
6821                                       const TargetLowering &TLI) {
6822  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
6823  MVT VT = N->getValueType(0);
6824  MVT EVT = VT.getVectorElementType();
6825  SDValue PermMask = N->getOperand(2);
6826  unsigned NumElems = PermMask.getNumOperands();
6827  SDNode *Base = NULL;
6828  if (!EltsFromConsecutiveLoads(N, PermMask, NumElems, EVT, Base,
6829                                DAG, MFI, TLI))
6830    return SDValue();
6831
6832  LoadSDNode *LD = cast<LoadSDNode>(Base);
6833  if (isBaseAlignmentOfN(16, Base->getOperand(1).getNode(), TLI))
6834    return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
6835                       LD->getSrcValueOffset(), LD->isVolatile());
6836  return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
6837                     LD->getSrcValueOffset(), LD->isVolatile(),
6838                     LD->getAlignment());
6839}
6840
6841/// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
6842static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
6843                                         const X86Subtarget *Subtarget,
6844                                         const TargetLowering &TLI) {
6845  unsigned NumOps = N->getNumOperands();
6846
6847  // Ignore single operand BUILD_VECTOR.
6848  if (NumOps == 1)
6849    return SDValue();
6850
6851  MVT VT = N->getValueType(0);
6852  MVT EVT = VT.getVectorElementType();
6853  if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
6854    // We are looking for load i64 and zero extend. We want to transform
6855    // it before legalizer has a chance to expand it. Also look for i64
6856    // BUILD_PAIR bit casted to f64.
6857    return SDValue();
6858  // This must be an insertion into a zero vector.
6859  SDValue HighElt = N->getOperand(1);
6860  if (!isZeroNode(HighElt))
6861    return SDValue();
6862
6863  // Value must be a load.
6864  SDNode *Base = N->getOperand(0).getNode();
6865  if (!isa<LoadSDNode>(Base)) {
6866    if (Base->getOpcode() != ISD::BIT_CONVERT)
6867      return SDValue();
6868    Base = Base->getOperand(0).getNode();
6869    if (!isa<LoadSDNode>(Base))
6870      return SDValue();
6871  }
6872
6873  // Transform it into VZEXT_LOAD addr.
6874  LoadSDNode *LD = cast<LoadSDNode>(Base);
6875
6876  // Load must not be an extload.
6877  if (LD->getExtensionType() != ISD::NON_EXTLOAD)
6878    return SDValue();
6879
6880  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6881  SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
6882  SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, Tys, Ops, 2);
6883  DAG.ReplaceAllUsesOfValueWith(SDValue(Base, 1), ResNode.getValue(1));
6884  return ResNode;
6885}
6886
6887/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
6888static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
6889                                      const X86Subtarget *Subtarget) {
6890  SDValue Cond = N->getOperand(0);
6891
6892  // If we have SSE[12] support, try to form min/max nodes.
6893  if (Subtarget->hasSSE2() &&
6894      (N->getValueType(0) == MVT::f32 || N->getValueType(0) == MVT::f64)) {
6895    if (Cond.getOpcode() == ISD::SETCC) {
6896      // Get the LHS/RHS of the select.
6897      SDValue LHS = N->getOperand(1);
6898      SDValue RHS = N->getOperand(2);
6899      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
6900
6901      unsigned Opcode = 0;
6902      if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
6903        switch (CC) {
6904        default: break;
6905        case ISD::SETOLE: // (X <= Y) ? X : Y -> min
6906        case ISD::SETULE:
6907        case ISD::SETLE:
6908          if (!UnsafeFPMath) break;
6909          // FALL THROUGH.
6910        case ISD::SETOLT:  // (X olt/lt Y) ? X : Y -> min
6911        case ISD::SETLT:
6912          Opcode = X86ISD::FMIN;
6913          break;
6914
6915        case ISD::SETOGT: // (X > Y) ? X : Y -> max
6916        case ISD::SETUGT:
6917        case ISD::SETGT:
6918          if (!UnsafeFPMath) break;
6919          // FALL THROUGH.
6920        case ISD::SETUGE:  // (X uge/ge Y) ? X : Y -> max
6921        case ISD::SETGE:
6922          Opcode = X86ISD::FMAX;
6923          break;
6924        }
6925      } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
6926        switch (CC) {
6927        default: break;
6928        case ISD::SETOGT: // (X > Y) ? Y : X -> min
6929        case ISD::SETUGT:
6930        case ISD::SETGT:
6931          if (!UnsafeFPMath) break;
6932          // FALL THROUGH.
6933        case ISD::SETUGE:  // (X uge/ge Y) ? Y : X -> min
6934        case ISD::SETGE:
6935          Opcode = X86ISD::FMIN;
6936          break;
6937
6938        case ISD::SETOLE:   // (X <= Y) ? Y : X -> max
6939        case ISD::SETULE:
6940        case ISD::SETLE:
6941          if (!UnsafeFPMath) break;
6942          // FALL THROUGH.
6943        case ISD::SETOLT:   // (X olt/lt Y) ? Y : X -> max
6944        case ISD::SETLT:
6945          Opcode = X86ISD::FMAX;
6946          break;
6947        }
6948      }
6949
6950      if (Opcode)
6951        return DAG.getNode(Opcode, N->getValueType(0), LHS, RHS);
6952    }
6953
6954  }
6955
6956  return SDValue();
6957}
6958
6959/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
6960static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
6961                                     const X86Subtarget *Subtarget) {
6962  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
6963  // the FP state in cases where an emms may be missing.
6964  // A preferable solution to the general problem is to figure out the right
6965  // places to insert EMMS.  This qualifies as a quick hack.
6966  StoreSDNode *St = cast<StoreSDNode>(N);
6967  if (St->getValue().getValueType().isVector() &&
6968      St->getValue().getValueType().getSizeInBits() == 64 &&
6969      isa<LoadSDNode>(St->getValue()) &&
6970      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
6971      St->getChain().hasOneUse() && !St->isVolatile()) {
6972    SDNode* LdVal = St->getValue().getNode();
6973    LoadSDNode *Ld = 0;
6974    int TokenFactorIndex = -1;
6975    SmallVector<SDValue, 8> Ops;
6976    SDNode* ChainVal = St->getChain().getNode();
6977    // Must be a store of a load.  We currently handle two cases:  the load
6978    // is a direct child, and it's under an intervening TokenFactor.  It is
6979    // possible to dig deeper under nested TokenFactors.
6980    if (ChainVal == LdVal)
6981      Ld = cast<LoadSDNode>(St->getChain());
6982    else if (St->getValue().hasOneUse() &&
6983             ChainVal->getOpcode() == ISD::TokenFactor) {
6984      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
6985        if (ChainVal->getOperand(i).getNode() == LdVal) {
6986          TokenFactorIndex = i;
6987          Ld = cast<LoadSDNode>(St->getValue());
6988        } else
6989          Ops.push_back(ChainVal->getOperand(i));
6990      }
6991    }
6992    if (Ld) {
6993      // If we are a 64-bit capable x86, lower to a single movq load/store pair.
6994      if (Subtarget->is64Bit()) {
6995        SDValue NewLd = DAG.getLoad(MVT::i64, Ld->getChain(),
6996                                      Ld->getBasePtr(), Ld->getSrcValue(),
6997                                      Ld->getSrcValueOffset(), Ld->isVolatile(),
6998                                      Ld->getAlignment());
6999        SDValue NewChain = NewLd.getValue(1);
7000        if (TokenFactorIndex != -1) {
7001          Ops.push_back(NewChain);
7002          NewChain = DAG.getNode(ISD::TokenFactor, MVT::Other, &Ops[0],
7003                                 Ops.size());
7004        }
7005        return DAG.getStore(NewChain, NewLd, St->getBasePtr(),
7006                            St->getSrcValue(), St->getSrcValueOffset(),
7007                            St->isVolatile(), St->getAlignment());
7008      }
7009
7010      // Otherwise, lower to two 32-bit copies.
7011      SDValue LoAddr = Ld->getBasePtr();
7012      SDValue HiAddr = DAG.getNode(ISD::ADD, MVT::i32, LoAddr,
7013                                     DAG.getConstant(4, MVT::i32));
7014
7015      SDValue LoLd = DAG.getLoad(MVT::i32, Ld->getChain(), LoAddr,
7016                                   Ld->getSrcValue(), Ld->getSrcValueOffset(),
7017                                   Ld->isVolatile(), Ld->getAlignment());
7018      SDValue HiLd = DAG.getLoad(MVT::i32, Ld->getChain(), HiAddr,
7019                                   Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
7020                                   Ld->isVolatile(),
7021                                   MinAlign(Ld->getAlignment(), 4));
7022
7023      SDValue NewChain = LoLd.getValue(1);
7024      if (TokenFactorIndex != -1) {
7025        Ops.push_back(LoLd);
7026        Ops.push_back(HiLd);
7027        NewChain = DAG.getNode(ISD::TokenFactor, MVT::Other, &Ops[0],
7028                               Ops.size());
7029      }
7030
7031      LoAddr = St->getBasePtr();
7032      HiAddr = DAG.getNode(ISD::ADD, MVT::i32, LoAddr,
7033                           DAG.getConstant(4, MVT::i32));
7034
7035      SDValue LoSt = DAG.getStore(NewChain, LoLd, LoAddr,
7036                          St->getSrcValue(), St->getSrcValueOffset(),
7037                          St->isVolatile(), St->getAlignment());
7038      SDValue HiSt = DAG.getStore(NewChain, HiLd, HiAddr,
7039                                    St->getSrcValue(),
7040                                    St->getSrcValueOffset() + 4,
7041                                    St->isVolatile(),
7042                                    MinAlign(St->getAlignment(), 4));
7043      return DAG.getNode(ISD::TokenFactor, MVT::Other, LoSt, HiSt);
7044    }
7045  }
7046  return SDValue();
7047}
7048
7049/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
7050/// X86ISD::FXOR nodes.
7051static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
7052  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
7053  // F[X]OR(0.0, x) -> x
7054  // F[X]OR(x, 0.0) -> x
7055  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
7056    if (C->getValueAPF().isPosZero())
7057      return N->getOperand(1);
7058  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
7059    if (C->getValueAPF().isPosZero())
7060      return N->getOperand(0);
7061  return SDValue();
7062}
7063
7064/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
7065static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
7066  // FAND(0.0, x) -> 0.0
7067  // FAND(x, 0.0) -> 0.0
7068  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
7069    if (C->getValueAPF().isPosZero())
7070      return N->getOperand(0);
7071  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
7072    if (C->getValueAPF().isPosZero())
7073      return N->getOperand(1);
7074  return SDValue();
7075}
7076
7077
7078SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
7079                                               DAGCombinerInfo &DCI) const {
7080  SelectionDAG &DAG = DCI.DAG;
7081  switch (N->getOpcode()) {
7082  default: break;
7083  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
7084  case ISD::BUILD_VECTOR:
7085    return PerformBuildVectorCombine(N, DAG, Subtarget, *this);
7086  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
7087  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
7088  case X86ISD::FXOR:
7089  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
7090  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
7091  }
7092
7093  return SDValue();
7094}
7095
7096//===----------------------------------------------------------------------===//
7097//                           X86 Inline Assembly Support
7098//===----------------------------------------------------------------------===//
7099
7100/// getConstraintType - Given a constraint letter, return the type of
7101/// constraint it is for this target.
7102X86TargetLowering::ConstraintType
7103X86TargetLowering::getConstraintType(const std::string &Constraint) const {
7104  if (Constraint.size() == 1) {
7105    switch (Constraint[0]) {
7106    case 'A':
7107    case 'f':
7108    case 'r':
7109    case 'R':
7110    case 'l':
7111    case 'q':
7112    case 'Q':
7113    case 'x':
7114    case 'y':
7115    case 'Y':
7116      return C_RegisterClass;
7117    default:
7118      break;
7119    }
7120  }
7121  return TargetLowering::getConstraintType(Constraint);
7122}
7123
7124/// LowerXConstraint - try to replace an X constraint, which matches anything,
7125/// with another that has more specific requirements based on the type of the
7126/// corresponding operand.
7127const char *X86TargetLowering::
7128LowerXConstraint(MVT ConstraintVT) const {
7129  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
7130  // 'f' like normal targets.
7131  if (ConstraintVT.isFloatingPoint()) {
7132    if (Subtarget->hasSSE2())
7133      return "Y";
7134    if (Subtarget->hasSSE1())
7135      return "x";
7136  }
7137
7138  return TargetLowering::LowerXConstraint(ConstraintVT);
7139}
7140
7141/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
7142/// vector.  If it is invalid, don't add anything to Ops.
7143void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
7144                                                     char Constraint,
7145                                                     bool hasMemory,
7146                                                     std::vector<SDValue>&Ops,
7147                                                     SelectionDAG &DAG) const {
7148  SDValue Result(0, 0);
7149
7150  switch (Constraint) {
7151  default: break;
7152  case 'I':
7153    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
7154      if (C->getZExtValue() <= 31) {
7155        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
7156        break;
7157      }
7158    }
7159    return;
7160  case 'J':
7161    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
7162      if (C->getZExtValue() <= 63) {
7163        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
7164        break;
7165      }
7166    }
7167    return;
7168  case 'N':
7169    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
7170      if (C->getZExtValue() <= 255) {
7171        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
7172        break;
7173      }
7174    }
7175    return;
7176  case 'i': {
7177    // Literal immediates are always ok.
7178    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
7179      Result = DAG.getTargetConstant(CST->getZExtValue(), Op.getValueType());
7180      break;
7181    }
7182
7183    // If we are in non-pic codegen mode, we allow the address of a global (with
7184    // an optional displacement) to be used with 'i'.
7185    GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
7186    int64_t Offset = 0;
7187
7188    // Match either (GA) or (GA+C)
7189    if (GA) {
7190      Offset = GA->getOffset();
7191    } else if (Op.getOpcode() == ISD::ADD) {
7192      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
7193      GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
7194      if (C && GA) {
7195        Offset = GA->getOffset()+C->getZExtValue();
7196      } else {
7197        C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
7198        GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
7199        if (C && GA)
7200          Offset = GA->getOffset()+C->getZExtValue();
7201        else
7202          C = 0, GA = 0;
7203      }
7204    }
7205
7206    if (GA) {
7207      if (hasMemory)
7208        Op = LowerGlobalAddress(GA->getGlobal(), DAG);
7209      else
7210        Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
7211                                        Offset);
7212      Result = Op;
7213      break;
7214    }
7215
7216    // Otherwise, not valid for this mode.
7217    return;
7218  }
7219  }
7220
7221  if (Result.getNode()) {
7222    Ops.push_back(Result);
7223    return;
7224  }
7225  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
7226                                                      Ops, DAG);
7227}
7228
7229std::vector<unsigned> X86TargetLowering::
7230getRegClassForInlineAsmConstraint(const std::string &Constraint,
7231                                  MVT VT) const {
7232  if (Constraint.size() == 1) {
7233    // FIXME: not handling fp-stack yet!
7234    switch (Constraint[0]) {      // GCC X86 Constraint Letters
7235    default: break;  // Unknown constraint letter
7236    case 'A':   // EAX/EDX
7237      if (VT == MVT::i32 || VT == MVT::i64)
7238        return make_vector<unsigned>(X86::EAX, X86::EDX, 0);
7239      break;
7240    case 'q':   // Q_REGS (GENERAL_REGS in 64-bit mode)
7241    case 'Q':   // Q_REGS
7242      if (VT == MVT::i32)
7243        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
7244      else if (VT == MVT::i16)
7245        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
7246      else if (VT == MVT::i8)
7247        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
7248      else if (VT == MVT::i64)
7249        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
7250      break;
7251    }
7252  }
7253
7254  return std::vector<unsigned>();
7255}
7256
7257std::pair<unsigned, const TargetRegisterClass*>
7258X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
7259                                                MVT VT) const {
7260  // First, see if this is a constraint that directly corresponds to an LLVM
7261  // register class.
7262  if (Constraint.size() == 1) {
7263    // GCC Constraint Letters
7264    switch (Constraint[0]) {
7265    default: break;
7266    case 'r':   // GENERAL_REGS
7267    case 'R':   // LEGACY_REGS
7268    case 'l':   // INDEX_REGS
7269      if (VT == MVT::i64 && Subtarget->is64Bit())
7270        return std::make_pair(0U, X86::GR64RegisterClass);
7271      if (VT == MVT::i32)
7272        return std::make_pair(0U, X86::GR32RegisterClass);
7273      else if (VT == MVT::i16)
7274        return std::make_pair(0U, X86::GR16RegisterClass);
7275      else if (VT == MVT::i8)
7276        return std::make_pair(0U, X86::GR8RegisterClass);
7277      break;
7278    case 'f':  // FP Stack registers.
7279      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
7280      // value to the correct fpstack register class.
7281      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
7282        return std::make_pair(0U, X86::RFP32RegisterClass);
7283      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
7284        return std::make_pair(0U, X86::RFP64RegisterClass);
7285      return std::make_pair(0U, X86::RFP80RegisterClass);
7286    case 'y':   // MMX_REGS if MMX allowed.
7287      if (!Subtarget->hasMMX()) break;
7288      return std::make_pair(0U, X86::VR64RegisterClass);
7289      break;
7290    case 'Y':   // SSE_REGS if SSE2 allowed
7291      if (!Subtarget->hasSSE2()) break;
7292      // FALL THROUGH.
7293    case 'x':   // SSE_REGS if SSE1 allowed
7294      if (!Subtarget->hasSSE1()) break;
7295
7296      switch (VT.getSimpleVT()) {
7297      default: break;
7298      // Scalar SSE types.
7299      case MVT::f32:
7300      case MVT::i32:
7301        return std::make_pair(0U, X86::FR32RegisterClass);
7302      case MVT::f64:
7303      case MVT::i64:
7304        return std::make_pair(0U, X86::FR64RegisterClass);
7305      // Vector types.
7306      case MVT::v16i8:
7307      case MVT::v8i16:
7308      case MVT::v4i32:
7309      case MVT::v2i64:
7310      case MVT::v4f32:
7311      case MVT::v2f64:
7312        return std::make_pair(0U, X86::VR128RegisterClass);
7313      }
7314      break;
7315    }
7316  }
7317
7318  // Use the default implementation in TargetLowering to convert the register
7319  // constraint into a member of a register class.
7320  std::pair<unsigned, const TargetRegisterClass*> Res;
7321  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
7322
7323  // Not found as a standard register?
7324  if (Res.second == 0) {
7325    // GCC calls "st(0)" just plain "st".
7326    if (StringsEqualNoCase("{st}", Constraint)) {
7327      Res.first = X86::ST0;
7328      Res.second = X86::RFP80RegisterClass;
7329    }
7330
7331    return Res;
7332  }
7333
7334  // Otherwise, check to see if this is a register class of the wrong value
7335  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
7336  // turn into {ax},{dx}.
7337  if (Res.second->hasType(VT))
7338    return Res;   // Correct type already, nothing to do.
7339
7340  // All of the single-register GCC register classes map their values onto
7341  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
7342  // really want an 8-bit or 32-bit register, map to the appropriate register
7343  // class and return the appropriate register.
7344  if (Res.second == X86::GR16RegisterClass) {
7345    if (VT == MVT::i8) {
7346      unsigned DestReg = 0;
7347      switch (Res.first) {
7348      default: break;
7349      case X86::AX: DestReg = X86::AL; break;
7350      case X86::DX: DestReg = X86::DL; break;
7351      case X86::CX: DestReg = X86::CL; break;
7352      case X86::BX: DestReg = X86::BL; break;
7353      }
7354      if (DestReg) {
7355        Res.first = DestReg;
7356        Res.second = Res.second = X86::GR8RegisterClass;
7357      }
7358    } else if (VT == MVT::i32) {
7359      unsigned DestReg = 0;
7360      switch (Res.first) {
7361      default: break;
7362      case X86::AX: DestReg = X86::EAX; break;
7363      case X86::DX: DestReg = X86::EDX; break;
7364      case X86::CX: DestReg = X86::ECX; break;
7365      case X86::BX: DestReg = X86::EBX; break;
7366      case X86::SI: DestReg = X86::ESI; break;
7367      case X86::DI: DestReg = X86::EDI; break;
7368      case X86::BP: DestReg = X86::EBP; break;
7369      case X86::SP: DestReg = X86::ESP; break;
7370      }
7371      if (DestReg) {
7372        Res.first = DestReg;
7373        Res.second = Res.second = X86::GR32RegisterClass;
7374      }
7375    } else if (VT == MVT::i64) {
7376      unsigned DestReg = 0;
7377      switch (Res.first) {
7378      default: break;
7379      case X86::AX: DestReg = X86::RAX; break;
7380      case X86::DX: DestReg = X86::RDX; break;
7381      case X86::CX: DestReg = X86::RCX; break;
7382      case X86::BX: DestReg = X86::RBX; break;
7383      case X86::SI: DestReg = X86::RSI; break;
7384      case X86::DI: DestReg = X86::RDI; break;
7385      case X86::BP: DestReg = X86::RBP; break;
7386      case X86::SP: DestReg = X86::RSP; break;
7387      }
7388      if (DestReg) {
7389        Res.first = DestReg;
7390        Res.second = Res.second = X86::GR64RegisterClass;
7391      }
7392    }
7393  } else if (Res.second == X86::FR32RegisterClass ||
7394             Res.second == X86::FR64RegisterClass ||
7395             Res.second == X86::VR128RegisterClass) {
7396    // Handle references to XMM physical registers that got mapped into the
7397    // wrong class.  This can happen with constraints like {xmm0} where the
7398    // target independent register mapper will just pick the first match it can
7399    // find, ignoring the required type.
7400    if (VT == MVT::f32)
7401      Res.second = X86::FR32RegisterClass;
7402    else if (VT == MVT::f64)
7403      Res.second = X86::FR64RegisterClass;
7404    else if (X86::VR128RegisterClass->hasType(VT))
7405      Res.second = X86::VR128RegisterClass;
7406  }
7407
7408  return Res;
7409}
7410