X86ISelLowering.cpp revision d6662add687f20cffa0755e410efbb40de4dcf23
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#include "X86.h"
16#include "X86InstrBuilder.h"
17#include "X86ISelLowering.h"
18#include "X86TargetMachine.h"
19#include "llvm/CallingConv.h"
20#include "llvm/Constants.h"
21#include "llvm/DerivedTypes.h"
22#include "llvm/GlobalAlias.h"
23#include "llvm/GlobalVariable.h"
24#include "llvm/Function.h"
25#include "llvm/Instructions.h"
26#include "llvm/Intrinsics.h"
27#include "llvm/LLVMContext.h"
28#include "llvm/ADT/BitVector.h"
29#include "llvm/ADT/VectorExtras.h"
30#include "llvm/CodeGen/MachineFrameInfo.h"
31#include "llvm/CodeGen/MachineFunction.h"
32#include "llvm/CodeGen/MachineInstrBuilder.h"
33#include "llvm/CodeGen/MachineModuleInfo.h"
34#include "llvm/CodeGen/MachineRegisterInfo.h"
35#include "llvm/CodeGen/PseudoSourceValue.h"
36#include "llvm/Support/MathExtras.h"
37#include "llvm/Support/Debug.h"
38#include "llvm/Support/ErrorHandling.h"
39#include "llvm/Target/TargetLoweringObjectFile.h"
40#include "llvm/Target/TargetOptions.h"
41#include "llvm/ADT/SmallSet.h"
42#include "llvm/ADT/StringExtras.h"
43#include "llvm/Support/CommandLine.h"
44#include "llvm/Support/raw_ostream.h"
45using namespace llvm;
46
47static cl::opt<bool>
48DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
49
50// Forward declarations.
51static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
52                       SDValue V2);
53
54static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
55  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
56  default: llvm_unreachable("unknown subtarget type");
57  case X86Subtarget::isDarwin:
58    return new TargetLoweringObjectFileMachO();
59  case X86Subtarget::isELF:
60    return new TargetLoweringObjectFileELF();
61  case X86Subtarget::isMingw:
62  case X86Subtarget::isCygwin:
63  case X86Subtarget::isWindows:
64    return new TargetLoweringObjectFileCOFF();
65  }
66
67}
68
69X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
70  : TargetLowering(TM, createTLOF(TM)) {
71  Subtarget = &TM.getSubtarget<X86Subtarget>();
72  X86ScalarSSEf64 = Subtarget->hasSSE2();
73  X86ScalarSSEf32 = Subtarget->hasSSE1();
74  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
75
76  RegInfo = TM.getRegisterInfo();
77  TD = getTargetData();
78
79  // Set up the TargetLowering object.
80
81  // X86 is weird; it always uses i8 for shift amounts and setcc results.
82  setShiftAmountType(MVT::i8);
83  setBooleanContents(ZeroOrOneBooleanContent);
84  setSchedulingPreference(SchedulingForRegPressure);
85  setStackPointerRegisterToSaveRestore(X86StackPtr);
86
87  if (Subtarget->isTargetDarwin()) {
88    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
89    setUseUnderscoreSetJmp(false);
90    setUseUnderscoreLongJmp(false);
91  } else if (Subtarget->isTargetMingw()) {
92    // MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
93    setUseUnderscoreSetJmp(true);
94    setUseUnderscoreLongJmp(false);
95  } else {
96    setUseUnderscoreSetJmp(true);
97    setUseUnderscoreLongJmp(true);
98  }
99
100  // Set up the register classes.
101  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
102  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
103  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
104  if (Subtarget->is64Bit())
105    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
106
107  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
108
109  // We don't accept any truncstore of integer registers.
110  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
111  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
112  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
113  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
114  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
115  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
116
117  // SETOEQ and SETUNE require checking two conditions.
118  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
119  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
120  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
121  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
122  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
123  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
124
125  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
126  // operation.
127  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
128  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
129  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
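  // (e.g. a uitofp from i8 is zero-extended to i32 and then done as a plain
  //  sitofp; the zero-extended value is always non-negative, so the signed
  //  conversion gives the same result.)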
130
131  if (Subtarget->is64Bit()) {
132    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
133    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
134  } else if (!UseSoftFloat) {
135    if (X86ScalarSSEf64) {
136      // We have an impenetrably clever algorithm for ui64->double only.
137      setOperationAction(ISD::UINT_TO_FP   , MVT::i64  , Custom);
138    }
139    // We have an algorithm for SSE2, and we turn this into a 64-bit
140    // FILD for other targets.
141    setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
142  }
143
144  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
145  // this operation.
146  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
147  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
148
149  if (!UseSoftFloat) {
150    // SSE has no i16 to fp conversion, only i32
151    if (X86ScalarSSEf32) {
152      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
153      // f32 and f64 cases are Legal, f80 case is not
154      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
155    } else {
156      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
157      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
158    }
159  } else {
160    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
161    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
162  }
163
164  // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
165  // are Legal, and f80 is custom lowered.
166  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
167  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
168
169  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
170  // this operation.
171  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
172  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
173
174  if (X86ScalarSSEf32) {
175    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
176    // f32 and f64 cases are Legal, f80 case is not
177    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
178  } else {
179    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
180    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
181  }
182
183  // Handle FP_TO_UINT by promoting the destination to a larger signed
184  // conversion.
185  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
186  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
187  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
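  // (e.g. an fptoui of f32 to i16 becomes an FP_TO_SINT to i32 plus a
  //  truncate; every u16 value fits in a signed i32, so the result is the
  //  same.)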
188
189  if (Subtarget->is64Bit()) {
190    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
191    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
192  } else if (!UseSoftFloat) {
193    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
194      // Expand FP_TO_UINT into a select.
195      // FIXME: We would like to use a Custom expander here eventually to do
196      // the optimal thing for SSE vs. the default expansion in the legalizer.
197      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
198    else
199      // With SSE3 we can use fisttpll to convert to a signed i64; without
200      // SSE, we're stuck with a fistpll.
201      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
202  }
203
204  // TODO: when we have SSE, these could be more efficient by using movd/movq.
205  if (!X86ScalarSSEf64) {
206    setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
207    setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
208  }
209
210  // Scalar integer divide and remainder are lowered to use operations that
211  // produce two results, to match the available instructions. This exposes
212  // the two-result form to trivial CSE, which is able to combine x/y and x%y
213  // into a single instruction.
214  //
215  // Scalar integer multiply-high is also lowered to use two-result
216  // operations, to match the available instructions. However, plain multiply
217  // (low) operations are left as Legal, as there are single-result
218  // instructions for this in x86. Using the two-result multiply instructions
219  // when both high and low results are needed must be arranged by dagcombine.
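  //
  // For example, the IR pair
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // is legalized to two ISD::SDIVREM nodes that CSE merges into one, and a
  // single 32-bit divide then produces the quotient in EAX and the remainder
  // in EDX.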
220  setOperationAction(ISD::MULHS           , MVT::i8    , Expand);
221  setOperationAction(ISD::MULHU           , MVT::i8    , Expand);
222  setOperationAction(ISD::SDIV            , MVT::i8    , Expand);
223  setOperationAction(ISD::UDIV            , MVT::i8    , Expand);
224  setOperationAction(ISD::SREM            , MVT::i8    , Expand);
225  setOperationAction(ISD::UREM            , MVT::i8    , Expand);
226  setOperationAction(ISD::MULHS           , MVT::i16   , Expand);
227  setOperationAction(ISD::MULHU           , MVT::i16   , Expand);
228  setOperationAction(ISD::SDIV            , MVT::i16   , Expand);
229  setOperationAction(ISD::UDIV            , MVT::i16   , Expand);
230  setOperationAction(ISD::SREM            , MVT::i16   , Expand);
231  setOperationAction(ISD::UREM            , MVT::i16   , Expand);
232  setOperationAction(ISD::MULHS           , MVT::i32   , Expand);
233  setOperationAction(ISD::MULHU           , MVT::i32   , Expand);
234  setOperationAction(ISD::SDIV            , MVT::i32   , Expand);
235  setOperationAction(ISD::UDIV            , MVT::i32   , Expand);
236  setOperationAction(ISD::SREM            , MVT::i32   , Expand);
237  setOperationAction(ISD::UREM            , MVT::i32   , Expand);
238  setOperationAction(ISD::MULHS           , MVT::i64   , Expand);
239  setOperationAction(ISD::MULHU           , MVT::i64   , Expand);
240  setOperationAction(ISD::SDIV            , MVT::i64   , Expand);
241  setOperationAction(ISD::UDIV            , MVT::i64   , Expand);
242  setOperationAction(ISD::SREM            , MVT::i64   , Expand);
243  setOperationAction(ISD::UREM            , MVT::i64   , Expand);
244
245  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
246  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
247  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
248  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
249  if (Subtarget->is64Bit())
250    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
251  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
252  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
253  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
254  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
255  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
256  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
257  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
258  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
259
260  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
261  setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
262  setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
263  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
264  setOperationAction(ISD::CTTZ             , MVT::i16  , Custom);
265  setOperationAction(ISD::CTLZ             , MVT::i16  , Custom);
266  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
267  setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
268  setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
269  if (Subtarget->is64Bit()) {
270    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
271    setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
272    setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
273  }
274
275  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
276  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
277
278  // These should be promoted to a larger select which is supported.
279  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
280  setOperationAction(ISD::SELECT           , MVT::i8   , Promote);
281  // X86 wants to expand cmov itself.
282  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
283  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
284  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
285  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
286  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
287  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
288  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
289  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
290  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
291  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
292  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
293  if (Subtarget->is64Bit()) {
294    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
295    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
296  }
297  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
298
299  // Darwin ABI issue.
300  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
301  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
302  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
303  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
304  if (Subtarget->is64Bit())
305    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
306  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
307  if (Subtarget->is64Bit()) {
308    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
309    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
310    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
311    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
312  }
313  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
314  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
315  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
316  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
317  if (Subtarget->is64Bit()) {
318    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
319    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
320    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
321  }
322
323  if (Subtarget->hasSSE1())
324    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
325
326  if (!Subtarget->hasSSE2())
327    setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);
328
329  // Expand certain atomics
330  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
331  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
332  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
333  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
334
335  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
336  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
337  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
338  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
339
340  if (!Subtarget->is64Bit()) {
341    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
342    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
343    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
344    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
345    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
346    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
347    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
348  }
349
350  // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
351  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
352  // FIXME - use subtarget debug flags
353  if (!Subtarget->isTargetDarwin() &&
354      !Subtarget->isTargetELF() &&
355      !Subtarget->isTargetCygMing()) {
356    setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
357    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
358  }
359
360  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
361  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
362  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
363  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
364  if (Subtarget->is64Bit()) {
365    setExceptionPointerRegister(X86::RAX);
366    setExceptionSelectorRegister(X86::RDX);
367  } else {
368    setExceptionPointerRegister(X86::EAX);
369    setExceptionSelectorRegister(X86::EDX);
370  }
371  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
372  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
373
374  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
375
376  setOperationAction(ISD::TRAP, MVT::Other, Legal);
377
378  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
379  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
380  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
381  if (Subtarget->is64Bit()) {
382    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
383    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
384  } else {
385    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
386    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
387  }
388
389  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
390  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
391  if (Subtarget->is64Bit())
392    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
393  if (Subtarget->isTargetCygMing())
394    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
395  else
396    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
397
398  if (!UseSoftFloat && X86ScalarSSEf64) {
399    // f32 and f64 use SSE.
400    // Set up the FP register classes.
401    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
402    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
403
404    // Use ANDPD to simulate FABS.
405    setOperationAction(ISD::FABS , MVT::f64, Custom);
406    setOperationAction(ISD::FABS , MVT::f32, Custom);
407
408    // Use XORP to simulate FNEG.
409    setOperationAction(ISD::FNEG , MVT::f64, Custom);
410    setOperationAction(ISD::FNEG , MVT::f32, Custom);
411
412    // Use ANDPD and ORPD to simulate FCOPYSIGN.
413    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
414    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
415
416    // We don't support sin/cos/fmod
417    setOperationAction(ISD::FSIN , MVT::f64, Expand);
418    setOperationAction(ISD::FCOS , MVT::f64, Expand);
419    setOperationAction(ISD::FSIN , MVT::f32, Expand);
420    setOperationAction(ISD::FCOS , MVT::f32, Expand);
421
422    // Expand FP immediates into loads from the stack, except for the special
423    // cases we handle.
424    addLegalFPImmediate(APFloat(+0.0)); // xorpd
425    addLegalFPImmediate(APFloat(+0.0f)); // xorps
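    // (+0.0 is "legal" here because it can be materialized with a single
    //  xorps/xorpd rather than a load.)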
426  } else if (!UseSoftFloat && X86ScalarSSEf32) {
427    // Use SSE for f32, x87 for f64.
428    // Set up the FP register classes.
429    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
430    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
431
432    // Use ANDPS to simulate FABS.
433    setOperationAction(ISD::FABS , MVT::f32, Custom);
434
435    // Use XORP to simulate FNEG.
436    setOperationAction(ISD::FNEG , MVT::f32, Custom);
437
438    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
439
440    // Use ANDPS and ORPS to simulate FCOPYSIGN.
441    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
442    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
443
444    // We don't support sin/cos/fmod
445    setOperationAction(ISD::FSIN , MVT::f32, Expand);
446    setOperationAction(ISD::FCOS , MVT::f32, Expand);
447
448    // Special cases we handle for FP constants.
449    addLegalFPImmediate(APFloat(+0.0f)); // xorps
450    addLegalFPImmediate(APFloat(+0.0)); // FLD0
451    addLegalFPImmediate(APFloat(+1.0)); // FLD1
452    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
453    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
454
455    if (!UnsafeFPMath) {
456      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
457      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
458    }
459  } else if (!UseSoftFloat) {
460    // f32 and f64 in x87.
461    // Set up the FP register classes.
462    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
463    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
464
465    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
466    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
467    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
468    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
469
470    if (!UnsafeFPMath) {
471      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
472      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
473    }
474    addLegalFPImmediate(APFloat(+0.0)); // FLD0
475    addLegalFPImmediate(APFloat(+1.0)); // FLD1
476    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
477    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
478    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
479    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
480    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
481    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
482  }
483
484  // Long double always uses X87.
485  if (!UseSoftFloat) {
486    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
487    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
488    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
489    {
490      bool ignored;
491      APFloat TmpFlt(+0.0);
492      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
493                     &ignored);
494      addLegalFPImmediate(TmpFlt);  // FLD0
495      TmpFlt.changeSign();
496      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
497      APFloat TmpFlt2(+1.0);
498      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
499                      &ignored);
500      addLegalFPImmediate(TmpFlt2);  // FLD1
501      TmpFlt2.changeSign();
502      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
503    }
504
505    if (!UnsafeFPMath) {
506      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
507      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
508    }
509  }
510
511  // Always use a library call for pow.
512  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
513  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
514  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
515
516  setOperationAction(ISD::FLOG, MVT::f80, Expand);
517  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
518  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
519  setOperationAction(ISD::FEXP, MVT::f80, Expand);
520  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
521
522  // First set operation action for all vector types to either promote
523  // (for widening) or expand (for scalarization). Then we will selectively
524  // turn on ones that can be effectively codegen'd.
525  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
526       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
527    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
528    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
529    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
530    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
531    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
532    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
533    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
534    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
535    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
536    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
537    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
538    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
539    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
540    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
541    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
542    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
543    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
544    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
545    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
546    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
547    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
548    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
549    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
550    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
551    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
552    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
553    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
554    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
555    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
556    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
557    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
558    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
559    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
560    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
561    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
562    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
563    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
564    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
565    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
566    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
567    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
568    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
569    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
570    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
571    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
572    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
573    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
574    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
575  }
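  // (For example, ISD::ADD on MVT::v4i32 starts out Expand here and is
  // switched to Legal in the SSE2 block below.)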
576
577  // FIXME: In order to prevent SSE instructions from being expanded to MMX ones
578  // with -msoft-float, disable use of MMX as well.
579  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
580    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
581    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
582    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
583    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
584    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
585
586    setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
587    setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
588    setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
589    setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
590
591    setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
592    setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
593    setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
594    setOperationAction(ISD::SUB,                MVT::v1i64, Legal);
595
596    setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
597    setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
598
599    setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
600    AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
601    setOperationAction(ISD::AND,                MVT::v4i16, Promote);
602    AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
603    setOperationAction(ISD::AND,                MVT::v2i32, Promote);
604    AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
605    setOperationAction(ISD::AND,                MVT::v1i64, Legal);
606
607    setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
608    AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
609    setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
610    AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
611    setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
612    AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
613    setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
614
615    setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
616    AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
617    setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
618    AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
619    setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
620    AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
621    setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
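    // (The bitwise AND/OR/XOR cases above are promoted to v1i64 so that a
    //  single MMX pand/por/pxor pattern covers every element type; the
    //  instructions operate on the whole 64-bit register regardless of
    //  element width.)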
622
623    setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
624    AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
625    setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
626    AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
627    setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
628    AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
629    setOperationAction(ISD::LOAD,               MVT::v2f32, Promote);
630    AddPromotedToType (ISD::LOAD,               MVT::v2f32, MVT::v1i64);
631    setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
632
633    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
634    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
635    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
636    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f32, Custom);
637    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
638
639    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
640    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
641    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
642    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
643
644    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2f32, Custom);
645    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
646    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
647    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
648
649    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
650
651    setTruncStoreAction(MVT::v8i16,             MVT::v8i8, Expand);
652    setOperationAction(ISD::TRUNCATE,           MVT::v8i8, Expand);
653    setOperationAction(ISD::SELECT,             MVT::v8i8, Promote);
654    setOperationAction(ISD::SELECT,             MVT::v4i16, Promote);
655    setOperationAction(ISD::SELECT,             MVT::v2i32, Promote);
656    setOperationAction(ISD::SELECT,             MVT::v1i64, Custom);
657    setOperationAction(ISD::VSETCC,             MVT::v8i8, Custom);
658    setOperationAction(ISD::VSETCC,             MVT::v4i16, Custom);
659    setOperationAction(ISD::VSETCC,             MVT::v2i32, Custom);
660  }
661
662  if (!UseSoftFloat && Subtarget->hasSSE1()) {
663    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
664
665    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
666    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
667    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
668    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
669    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
670    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
671    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
672    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
673    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
674    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
675    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
676    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
677  }
678
679  if (!UseSoftFloat && Subtarget->hasSSE2()) {
680    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
681
682    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
683    // registers cannot be used even for integer operations.
684    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
685    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
686    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
687    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
688
689    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
690    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
691    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
692    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
693    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
694    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
695    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
696    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
697    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
698    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
699    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
700    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
701    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
702    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
703    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
704    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
705
706    setOperationAction(ISD::VSETCC,             MVT::v2f64, Custom);
707    setOperationAction(ISD::VSETCC,             MVT::v16i8, Custom);
708    setOperationAction(ISD::VSETCC,             MVT::v8i16, Custom);
709    setOperationAction(ISD::VSETCC,             MVT::v4i32, Custom);
710
711    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
712    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
713    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
714    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
715    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
716
717    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
718    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
719      MVT VT = (MVT::SimpleValueType)i;
720      // Do not attempt to custom lower non-power-of-2 vectors
721      if (!isPowerOf2_32(VT.getVectorNumElements()))
722        continue;
723      // Do not attempt to custom lower non-128-bit vectors
724      if (!VT.is128BitVector())
725        continue;
726      setOperationAction(ISD::BUILD_VECTOR,       VT.getSimpleVT(), Custom);
727      setOperationAction(ISD::VECTOR_SHUFFLE,     VT.getSimpleVT(), Custom);
728      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
729    }
730
731    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
732    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
733    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
734    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
735    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
736    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
737
738    if (Subtarget->is64Bit()) {
739      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
740      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
741    }
742
743    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
744    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
745      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
746      MVT VT = SVT;
747
748      // Do not attempt to promote non-128-bit vectors
749      if (!VT.is128BitVector()) {
750        continue;
751      }
752      setOperationAction(ISD::AND,    SVT, Promote);
753      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
754      setOperationAction(ISD::OR,     SVT, Promote);
755      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
756      setOperationAction(ISD::XOR,    SVT, Promote);
757      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
758      setOperationAction(ISD::LOAD,   SVT, Promote);
759      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
760      setOperationAction(ISD::SELECT, SVT, Promote);
761      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
762    }
763
764    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
765
766    // Custom lower v2i64 and v2f64 selects.
767    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
768    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
769    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
770    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
771
772    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
773    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
774    if (!DisableMMX && Subtarget->hasMMX()) {
775      setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
776      setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
777    }
778  }
779
780  if (Subtarget->hasSSE41()) {
781    // FIXME: Do we need to handle scalar-to-vector here?
782    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
783
784    // i8 and i16 vectors are custom, because the source register and source
785    // memory operand types are not the same width.  f32 vectors are
786    // custom since the immediate controlling the insert encodes additional
787    // information.
788    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
789    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
790    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
791    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
792
793    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
794    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
795    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
796    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
797
798    if (Subtarget->is64Bit()) {
799      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
800      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
801    }
802  }
803
804  if (Subtarget->hasSSE42()) {
805    setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
806  }
807
808  if (!UseSoftFloat && Subtarget->hasAVX()) {
809    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
810    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
811    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
812    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
813
814    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
815    setOperationAction(ISD::LOAD,               MVT::v8i32, Legal);
816    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
817    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
818    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
819    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
820    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
821    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
822    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
823    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
824    //setOperationAction(ISD::BUILD_VECTOR,       MVT::v8f32, Custom);
825    //setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8f32, Custom);
826    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
827    //setOperationAction(ISD::SELECT,             MVT::v8f32, Custom);
828    //setOperationAction(ISD::VSETCC,             MVT::v8f32, Custom);
829
830    // Operations to consider (commented out): v16i16, v32i8
831    //setOperationAction(ISD::ADD,                MVT::v16i16, Legal);
832    setOperationAction(ISD::ADD,                MVT::v8i32, Custom);
833    setOperationAction(ISD::ADD,                MVT::v4i64, Custom);
834    //setOperationAction(ISD::SUB,                MVT::v32i8, Legal);
835    //setOperationAction(ISD::SUB,                MVT::v16i16, Legal);
836    setOperationAction(ISD::SUB,                MVT::v8i32, Custom);
837    setOperationAction(ISD::SUB,                MVT::v4i64, Custom);
838    //setOperationAction(ISD::MUL,                MVT::v16i16, Legal);
839    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
840    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
841    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
842    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
843    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
844    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
845
846    setOperationAction(ISD::VSETCC,             MVT::v4f64, Custom);
847    // setOperationAction(ISD::VSETCC,             MVT::v32i8, Custom);
848    // setOperationAction(ISD::VSETCC,             MVT::v16i16, Custom);
849    setOperationAction(ISD::VSETCC,             MVT::v8i32, Custom);
850
851    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i8, Custom);
852    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i16, Custom);
853    // setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i16, Custom);
854    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i32, Custom);
855    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8f32, Custom);
856
857    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f64, Custom);
858    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i64, Custom);
859    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f64, Custom);
860    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i64, Custom);
861    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f64, Custom);
862    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);
863
864#if 0
865    // Not sure we want to do this since there are no 256-bit integer
866    // operations in AVX
867
868    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
869    // This includes 256-bit vectors
870    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
871      MVT VT = (MVT::SimpleValueType)i;
872
873      // Do not attempt to custom lower non-power-of-2 vectors
874      if (!isPowerOf2_32(VT.getVectorNumElements()))
875        continue;
876
877      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
878      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
879      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
880    }
881
882    if (Subtarget->is64Bit()) {
883      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i64, Custom);
884      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
885    }
886#endif
887
888#if 0
889    // Not sure we want to do this since there are no 256-bit integer
890    // operations in AVX
891
892    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
893    // Including 256-bit vectors
894    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
895      MVT VT = (MVT::SimpleValueType)i;
896
897      if (!VT.is256BitVector()) {
898        continue;
899      }
900      setOperationAction(ISD::AND,    VT, Promote);
901      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
902      setOperationAction(ISD::OR,     VT, Promote);
903      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
904      setOperationAction(ISD::XOR,    VT, Promote);
905      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
906      setOperationAction(ISD::LOAD,   VT, Promote);
907      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
908      setOperationAction(ISD::SELECT, VT, Promote);
909      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
910    }
911
912    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
913#endif
914  }
915
916  // We want to custom lower some of our intrinsics.
917  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
918
919  // Add/Sub/Mul with overflow operations are custom lowered.
920  setOperationAction(ISD::SADDO, MVT::i32, Custom);
921  setOperationAction(ISD::SADDO, MVT::i64, Custom);
922  setOperationAction(ISD::UADDO, MVT::i32, Custom);
923  setOperationAction(ISD::UADDO, MVT::i64, Custom);
924  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
925  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
926  setOperationAction(ISD::USUBO, MVT::i32, Custom);
927  setOperationAction(ISD::USUBO, MVT::i64, Custom);
928  setOperationAction(ISD::SMULO, MVT::i32, Custom);
929  setOperationAction(ISD::SMULO, MVT::i64, Custom);
930
931  if (!Subtarget->is64Bit()) {
932    // These libcalls are not available in 32-bit.
933    setLibcallName(RTLIB::SHL_I128, 0);
934    setLibcallName(RTLIB::SRL_I128, 0);
935    setLibcallName(RTLIB::SRA_I128, 0);
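    // (A null name tells the lowering code that no runtime function exists
    //  for these operations.)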
936  }
937
938  // We have target-specific dag combine patterns for the following nodes:
939  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
940  setTargetDAGCombine(ISD::BUILD_VECTOR);
941  setTargetDAGCombine(ISD::SELECT);
942  setTargetDAGCombine(ISD::SHL);
943  setTargetDAGCombine(ISD::SRA);
944  setTargetDAGCombine(ISD::SRL);
945  setTargetDAGCombine(ISD::STORE);
946  setTargetDAGCombine(ISD::MEMBARRIER);
947  if (Subtarget->is64Bit())
948    setTargetDAGCombine(ISD::MUL);
949
950  computeRegisterProperties();
951
952  // FIXME: These should be based on subtarget info. Plus, the values should
953  // be smaller when we are optimizing for size.
954  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
955  maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
956  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
957  allowUnalignedMemoryAccesses = true; // x86 supports it!
958  setPrefLoopAlignment(16);
959  benefitFromCodePlacementOpt = true;
960}
961
962
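// x86's SETcc instructions write an 8-bit register, so i8 is the natural
// scalar setcc result type.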
963MVT::SimpleValueType X86TargetLowering::getSetCCResultType(MVT VT) const {
964  return MVT::i8;
965}
966
967
968/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
969/// the desired ByVal argument alignment.
970static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
971  if (MaxAlign == 16)
972    return;
973  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
974    if (VTy->getBitWidth() == 128)
975      MaxAlign = 16;
976  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
977    unsigned EltAlign = 0;
978    getMaxByValAlign(ATy->getElementType(), EltAlign);
979    if (EltAlign > MaxAlign)
980      MaxAlign = EltAlign;
981  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
982    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
983      unsigned EltAlign = 0;
984      getMaxByValAlign(STy->getElementType(i), EltAlign);
985      if (EltAlign > MaxAlign)
986        MaxAlign = EltAlign;
987      if (MaxAlign == 16)
988        break;
989    }
990  }
991  return;
992}
993
994/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
995/// function arguments in the caller parameter area. For X86, aggregates
996/// that contain SSE vectors are placed at 16-byte boundaries while the rest
997/// are at 4-byte boundaries.
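/// For example, a byval struct containing a <4 x float> member is aligned to
/// 16 bytes when SSE1 is available, while a struct of plain i32 fields stays
/// at 4 bytes on 32-bit targets.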
998unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
999  if (Subtarget->is64Bit()) {
1000    // Max of 8 and alignment of type.
1001    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1002    if (TyAlign > 8)
1003      return TyAlign;
1004    return 8;
1005  }
1006
1007  unsigned Align = 4;
1008  if (Subtarget->hasSSE1())
1009    getMaxByValAlign(Ty, Align);
1010  return Align;
1011}
1012
1013/// getOptimalMemOpType - Returns the target specific optimal type for load
1014/// and store operations as a result of memset, memcpy, and memmove
1015/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
1016/// determining it.
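/// Roughly: a 16-byte-or-larger copy from a constant source on an SSE2 target
/// with a 16-byte-aligned stack gets v4i32 (v4f32 with only SSE1); otherwise
/// 64-bit targets use i64 for operations of 8 bytes or more, and everything
/// else falls back to i32.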
1017MVT
1018X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
1019                                       bool isSrcConst, bool isSrcStr,
1020                                       SelectionDAG &DAG) const {
1021  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
1022  // linux.  This is because the stack realignment code can't handle certain
1023  // cases like PR2962.  This should be removed when PR2962 is fixed.
1024  const Function *F = DAG.getMachineFunction().getFunction();
1025  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
1026  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
1027    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
1028      return MVT::v4i32;
1029    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
1030      return MVT::v4f32;
1031  }
1032  if (Subtarget->is64Bit() && Size >= 8)
1033    return MVT::i64;
1034  return MVT::i32;
1035}
1036
1037/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
1038/// jumptable.
1039SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1040                                                      SelectionDAG &DAG) const {
1041  if (usesGlobalOffsetTable())
1042    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
1043  if (!Subtarget->is64Bit())
1044    // This doesn't have DebugLoc associated with it, but is not really the
1045    // same as a Register.
1046    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
1047                       getPointerTy());
1048  return Table;
1049}
1050
1051/// getFunctionAlignment - Return the Log2 alignment of this function.
1052unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
1053  return F->hasFnAttr(Attribute::OptimizeForSize) ? 1 : 4;
1054}
1055
1056//===----------------------------------------------------------------------===//
1057//               Return Value Calling Convention Implementation
1058//===----------------------------------------------------------------------===//
1059
1060#include "X86GenCallingConv.inc"
1061
1062SDValue
1063X86TargetLowering::LowerReturn(SDValue Chain,
1064                               unsigned CallConv, bool isVarArg,
1065                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1066                               DebugLoc dl, SelectionDAG &DAG) {
1067
1068  SmallVector<CCValAssign, 16> RVLocs;
1069  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1070                 RVLocs, *DAG.getContext());
1071  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1072
1073  // If this is the first return lowered for this function, add the regs to the
1074  // liveout set for the function.
1075  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
1076    for (unsigned i = 0; i != RVLocs.size(); ++i)
1077      if (RVLocs[i].isRegLoc())
1078        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1079  }
1080
1081  SDValue Flag;
1082
1083  SmallVector<SDValue, 6> RetOps;
1084  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1085  // Operand #1 = Bytes To Pop
1086  RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16));
1087
1088  // Copy the result values into the output registers.
1089  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1090    CCValAssign &VA = RVLocs[i];
1091    assert(VA.isRegLoc() && "Can only return in registers!");
1092    SDValue ValToCopy = Outs[i].Val;
1093
1094    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1095    // the RET instruction and handled by the FP Stackifier.
1096    if (VA.getLocReg() == X86::ST0 ||
1097        VA.getLocReg() == X86::ST1) {
1098      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1099      // change the value to the FP stack register class.
1100      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1101        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1102      RetOps.push_back(ValToCopy);
1103      // Don't emit a copytoreg.
1104      continue;
1105    }
1106
1107    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1108    // which is returned in RAX / RDX.
1109    if (Subtarget->is64Bit()) {
1110      MVT ValVT = ValToCopy.getValueType();
1111      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
1112        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
1113        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
1114          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
1115      }
1116    }
1117
1118    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1119    Flag = Chain.getValue(1);
1120  }
1121
1122  // The x86-64 ABI for returning structs by value requires that we copy
1123  // the sret argument into %rax for the return. We saved the argument into
1124  // a virtual register in the entry block, so now we copy the value out
1125  // and into %rax.
1126  if (Subtarget->is64Bit() &&
1127      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1128    MachineFunction &MF = DAG.getMachineFunction();
1129    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1130    unsigned Reg = FuncInfo->getSRetReturnReg();
1131    if (!Reg) {
1132      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1133      FuncInfo->setSRetReturnReg(Reg);
1134    }
1135    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1136
1137    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1138    Flag = Chain.getValue(1);
1139  }
1140
1141  RetOps[0] = Chain;  // Update chain.
1142
1143  // Add the flag if we have it.
1144  if (Flag.getNode())
1145    RetOps.push_back(Flag);
1146
1147  return DAG.getNode(X86ISD::RET_FLAG, dl,
1148                     MVT::Other, &RetOps[0], RetOps.size());
1149}
1150
1151/// LowerCallResult - Lower the result values of a call into the
1152/// appropriate copies out of appropriate physical registers.
1153///
1154SDValue
1155X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1156                                   unsigned CallConv, bool isVarArg,
1157                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1158                                   DebugLoc dl, SelectionDAG &DAG,
1159                                   SmallVectorImpl<SDValue> &InVals) {
1160
1161  // Assign locations to each value returned by this call.
1162  SmallVector<CCValAssign, 16> RVLocs;
1163  bool Is64Bit = Subtarget->is64Bit();
1164  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1165                 RVLocs, *DAG.getContext());
1166  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1167
1168  // Copy all of the result registers out of their specified physreg.
1169  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1170    CCValAssign &VA = RVLocs[i];
1171    MVT CopyVT = VA.getValVT();
1172
1173    // If this is x86-64, and we disabled SSE, we can't return FP values
1174    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1175        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1176      llvm_report_error("SSE register return with SSE disabled");
1177    }
1178
1179    // If this is a call to a function that returns an fp value on the floating
1180    // point stack, but where we prefer to use the value in xmm registers, copy
1181    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
1182    if ((VA.getLocReg() == X86::ST0 ||
1183         VA.getLocReg() == X86::ST1) &&
1184        isScalarFPTypeInSSEReg(VA.getValVT())) {
1185      CopyVT = MVT::f80;
1186    }
1187
1188    SDValue Val;
1189    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
1190      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
1191      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1192        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1193                                   MVT::v2i64, InFlag).getValue(1);
1194        Val = Chain.getValue(0);
1195        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1196                          Val, DAG.getConstant(0, MVT::i64));
1197      } else {
1198        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1199                                   MVT::i64, InFlag).getValue(1);
1200        Val = Chain.getValue(0);
1201      }
1202      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
1203    } else {
1204      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1205                                 CopyVT, InFlag).getValue(1);
1206      Val = Chain.getValue(0);
1207    }
1208    InFlag = Chain.getValue(2);
1209
1210    if (CopyVT != VA.getValVT()) {
1211      // Round the F80 to the right size, which also moves it to the
1212      // appropriate xmm register.
1213      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1214                        // This truncation won't change the value.
1215                        DAG.getIntPtrConstant(1));
1216    }
1217
1218    InVals.push_back(Val);
1219  }
1220
1221  return Chain;
1222}
1223
1224
1225//===----------------------------------------------------------------------===//
1226//                C & StdCall & Fast Calling Convention implementation
1227//===----------------------------------------------------------------------===//
1228//  The StdCall calling convention is standard for many Windows API
1229//  routines. It differs from the C calling convention only slightly: the
1230//  callee, not the caller, cleans up the stack, and symbols are decorated
1231//  (e.g. with an @<bytes> suffix). It doesn't support any vector arguments.
1232//  For info on fast calling convention see Fast Calling Convention (tail call)
1233//  implementation LowerX86_32FastCCCallTo.
1234
1235/// CallIsStructReturn - Determines whether a call uses struct return
1236/// semantics.
1237static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1238  if (Outs.empty())
1239    return false;
1240
1241  return Outs[0].Flags.isSRet();
1242}
1243
1244/// ArgsAreStructReturn - Determines whether a function uses struct
1245/// return semantics.
1246static bool
1247ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1248  if (Ins.empty())
1249    return false;
1250
1251  return Ins[0].Flags.isSRet();
1252}
1253
1254/// IsCalleePop - Determines whether the callee is required to pop its
1255/// own arguments. Callee pop is necessary to support tail calls.
1256bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
1257  if (IsVarArg)
1258    return false;
1259
1260  switch (CallingConv) {
1261  default:
1262    return false;
1263  case CallingConv::X86_StdCall:
1264    return !Subtarget->is64Bit();
1265  case CallingConv::X86_FastCall:
1266    return !Subtarget->is64Bit();
1267  case CallingConv::Fast:
1268    return PerformTailCallOpt;
1269  }
1270}
1271
1272/// CCAssignFnForNode - Selects the correct CCAssignFn for the
1273/// given CallingConvention value.
1274CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
1275  if (Subtarget->is64Bit()) {
1276    if (Subtarget->isTargetWin64())
1277      return CC_X86_Win64_C;
1278    else
1279      return CC_X86_64_C;
1280  }
1281
1282  if (CC == CallingConv::X86_FastCall)
1283    return CC_X86_32_FastCall;
1284  else if (CC == CallingConv::Fast)
1285    return CC_X86_32_FastCC;
1286  else
1287    return CC_X86_32_C;
1288}
1289
1290/// NameDecorationForCallConv - Selects the appropriate decoration to
1291/// apply to a MachineFunction containing a given calling convention.
1292NameDecorationStyle
1293X86TargetLowering::NameDecorationForCallConv(unsigned CallConv) {
1294  if (CallConv == CallingConv::X86_FastCall)
1295    return FastCall;
1296  else if (CallConv == CallingConv::X86_StdCall)
1297    return StdCall;
1298  return None;
1299}
1300
1301
1302/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1303/// by "Src" to address "Dst" with size and alignment information specified by
1304/// the specific parameter attribute. The copy will be passed as a byval
1305/// function parameter.
1306static SDValue
1307CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1308                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1309                          DebugLoc dl) {
1310  SDValue SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1311  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1312                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
1313}
1314
1315SDValue
1316X86TargetLowering::LowerMemArgument(SDValue Chain,
1317                                    unsigned CallConv,
1318                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1319                                    DebugLoc dl, SelectionDAG &DAG,
1320                                    const CCValAssign &VA,
1321                                    MachineFrameInfo *MFI,
1322                                    unsigned i) {
1323
1324  // Create the nodes corresponding to a load from this parameter slot.
1325  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1326  bool AlwaysUseMutable = (CallConv==CallingConv::Fast) && PerformTailCallOpt;
1327  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1328
1329  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1330  // changed with more analysis.
1331  // In case of tail call optimization, mark all arguments mutable, since they
1332  // could be overwritten by the lowering of arguments in case of a tail call.
1333  int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
1334                                  VA.getLocMemOffset(), isImmutable);
1335  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1336  if (Flags.isByVal())
1337    return FIN;
1338  return DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
1339                     PseudoSourceValue::getFixedStack(FI), 0);
1340}
1341
1342SDValue
1343X86TargetLowering::LowerFormalArguments(SDValue Chain,
1344                                        unsigned CallConv,
1345                                        bool isVarArg,
1346                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1347                                        DebugLoc dl,
1348                                        SelectionDAG &DAG,
1349                                        SmallVectorImpl<SDValue> &InVals) {
1350
1351  MachineFunction &MF = DAG.getMachineFunction();
1352  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1353
1354  const Function* Fn = MF.getFunction();
1355  if (Fn->hasExternalLinkage() &&
1356      Subtarget->isTargetCygMing() &&
1357      Fn->getName() == "main")
1358    FuncInfo->setForceFramePointer(true);
1359
1360  // Decorate the function name.
1361  FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv));
1362
1363  MachineFrameInfo *MFI = MF.getFrameInfo();
1364  bool Is64Bit = Subtarget->is64Bit();
1365  bool IsWin64 = Subtarget->isTargetWin64();
1366
1367  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
1368         "Var args not supported with calling convention fastcc");
1369
1370  // Assign locations to all of the incoming arguments.
1371  SmallVector<CCValAssign, 16> ArgLocs;
1372  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1373                 ArgLocs, *DAG.getContext());
1374  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1375
1376  unsigned LastVal = ~0U;
1377  SDValue ArgValue;
1378  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1379    CCValAssign &VA = ArgLocs[i];
1380    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1381    // places.
1382    assert(VA.getValNo() != LastVal &&
1383           "Don't support value assigned to multiple locs yet");
1384    LastVal = VA.getValNo();
1385
1386    if (VA.isRegLoc()) {
1387      MVT RegVT = VA.getLocVT();
1388      TargetRegisterClass *RC = NULL;
1389      if (RegVT == MVT::i32)
1390        RC = X86::GR32RegisterClass;
1391      else if (Is64Bit && RegVT == MVT::i64)
1392        RC = X86::GR64RegisterClass;
1393      else if (RegVT == MVT::f32)
1394        RC = X86::FR32RegisterClass;
1395      else if (RegVT == MVT::f64)
1396        RC = X86::FR64RegisterClass;
1397      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
1398        RC = X86::VR128RegisterClass;
1399      else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
1400        RC = X86::VR64RegisterClass;
1401      else
1402        llvm_unreachable("Unknown argument type!");
1403
1404      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1405      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1406
1407      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1408      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1409      // right size.
1410      if (VA.getLocInfo() == CCValAssign::SExt)
1411        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1412                               DAG.getValueType(VA.getValVT()));
1413      else if (VA.getLocInfo() == CCValAssign::ZExt)
1414        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1415                               DAG.getValueType(VA.getValVT()));
1416      else if (VA.getLocInfo() == CCValAssign::BCvt)
1417        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1418
1419      if (VA.isExtInLoc()) {
1420        // Handle MMX values passed in XMM regs.
1421        if (RegVT.isVector()) {
1422          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1423                                 ArgValue, DAG.getConstant(0, MVT::i64));
1424          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1425        } else
1426          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1427      }
1428    } else {
1429      assert(VA.isMemLoc());
1430      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1431    }
1432
1433    // If value is passed via pointer - do a load.
1434    if (VA.getLocInfo() == CCValAssign::Indirect)
1435      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0);
1436
1437    InVals.push_back(ArgValue);
1438  }
1439
1440  // The x86-64 ABI for returning structs by value requires that we copy
1441  // the sret argument into %rax for the return. Save the argument into
1442  // a virtual register so that we can access it from the return points.
1443  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1444    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1445    unsigned Reg = FuncInfo->getSRetReturnReg();
1446    if (!Reg) {
1447      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1448      FuncInfo->setSRetReturnReg(Reg);
1449    }
1450    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1451    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1452  }
1453
1454  unsigned StackSize = CCInfo.getNextStackOffset();
1455  // align stack specially for tail calls
1456  if (PerformTailCallOpt && CallConv == CallingConv::Fast)
1457    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1458
1459  // If the function takes variable number of arguments, make a frame index for
1460  // the start of the first vararg value... for expansion of llvm.va_start.
1461  if (isVarArg) {
1462    if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
1463      VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
1464    }
1465    if (Is64Bit) {
1466      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1467
1468      // FIXME: We should really autogenerate these arrays
1469      static const unsigned GPR64ArgRegsWin64[] = {
1470        X86::RCX, X86::RDX, X86::R8,  X86::R9
1471      };
1472      static const unsigned XMMArgRegsWin64[] = {
1473        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1474      };
1475      static const unsigned GPR64ArgRegs64Bit[] = {
1476        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1477      };
1478      static const unsigned XMMArgRegs64Bit[] = {
1479        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1480        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1481      };
1482      const unsigned *GPR64ArgRegs, *XMMArgRegs;
1483
1484      if (IsWin64) {
1485        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1486        GPR64ArgRegs = GPR64ArgRegsWin64;
1487        XMMArgRegs = XMMArgRegsWin64;
1488      } else {
1489        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1490        GPR64ArgRegs = GPR64ArgRegs64Bit;
1491        XMMArgRegs = XMMArgRegs64Bit;
1492      }
1493      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1494                                                       TotalNumIntRegs);
1495      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1496                                                       TotalNumXMMRegs);
1497
1498      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1499      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1500             "SSE register cannot be used when SSE is disabled!");
1501      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1502             "SSE register cannot be used when soft-float and no-implicit-float are set!");
1503      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1504        // SSE is disabled or disallowed (e.g. in kernel mode), so don't push
1505        // XMM registers on the stack.
1506        TotalNumXMMRegs = 0;
1507
1508      // For X86-64, if there are vararg parameters that are passed via
1509      // registers, then we must store them to their spots on the stack so they
1510      // may be loaded by dereferencing the result of va_next.
1511      VarArgsGPOffset = NumIntRegs * 8;
1512      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
1513      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
1514                                                 TotalNumXMMRegs * 16, 16);
1515
1516      // Store the integer parameter registers.
1517      SmallVector<SDValue, 8> MemOps;
1518      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
1519      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1520                                  DAG.getIntPtrConstant(VarArgsGPOffset));
1521      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1522        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1523                                     X86::GR64RegisterClass);
1524        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1525        SDValue Store =
1526          DAG.getStore(Val.getValue(1), dl, Val, FIN,
1527                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
1528        MemOps.push_back(Store);
1529        FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
1530                          DAG.getIntPtrConstant(8));
1531      }
1532
1533      // Now store the XMM (fp + vector) parameter registers.
1534      FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1535                        DAG.getIntPtrConstant(VarArgsFPOffset));
1536      for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1537        unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
1538                                     X86::VR128RegisterClass);
1539        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
1540        SDValue Store =
1541          DAG.getStore(Val.getValue(1), dl, Val, FIN,
1542                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
1543        MemOps.push_back(Store);
1544        FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
1545                          DAG.getIntPtrConstant(16));
1546      }
1547      if (!MemOps.empty())
1548          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1549                             &MemOps[0], MemOps.size());
1550    }
1551  }
1552
1553  // Some CCs need callee pop.
1554  if (IsCalleePop(isVarArg, CallConv)) {
1555    BytesToPopOnReturn  = StackSize; // Callee pops everything.
1556    BytesCallerReserves = 0;
1557  } else {
1558    BytesToPopOnReturn  = 0; // Callee pops nothing.
1559    // If this is an sret function, the return should pop the hidden pointer.
1560    if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins))
1561      BytesToPopOnReturn = 4;
1562    BytesCallerReserves = StackSize;
1563  }
1564
1565  if (!Is64Bit) {
1566    RegSaveFrameIndex = 0xAAAAAAA;   // RegSaveFrameIndex is X86-64 only.
1567    if (CallConv == CallingConv::X86_FastCall)
1568      VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
1569  }
1570
1571  FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
1572
1573  return Chain;
1574}
1575
1576SDValue
1577X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
1578                                    SDValue StackPtr, SDValue Arg,
1579                                    DebugLoc dl, SelectionDAG &DAG,
1580                                    const CCValAssign &VA,
1581                                    ISD::ArgFlagsTy Flags) {
1582  const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
1583  unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
1584  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1585  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1586  if (Flags.isByVal()) {
1587    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1588  }
1589  return DAG.getStore(Chain, dl, Arg, PtrOff,
1590                      PseudoSourceValue::getStack(), LocMemOffset);
1591}
1592
1593/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
1594/// optimization is performed and it is required.
1595SDValue
1596X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1597                                           SDValue &OutRetAddr,
1598                                           SDValue Chain,
1599                                           bool IsTailCall,
1600                                           bool Is64Bit,
1601                                           int FPDiff,
1602                                           DebugLoc dl) {
1603  if (!IsTailCall || FPDiff==0) return Chain;
1604
1605  // Adjust the Return address stack slot.
1606  MVT VT = getPointerTy();
1607  OutRetAddr = getReturnAddressFrameIndex(DAG);
1608
1609  // Load the "old" Return address.
1610  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
1611  return SDValue(OutRetAddr.getNode(), 1);
1612}
1613
1614/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1615/// optimization is performed and it is required (FPDiff!=0).
1616static SDValue
1617EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1618                         SDValue Chain, SDValue RetAddrFrIdx,
1619                         bool Is64Bit, int FPDiff, DebugLoc dl) {
1620  // Store the return address to the appropriate stack slot.
1621  if (!FPDiff) return Chain;
1622  // Calculate the new stack slot for the return address.
1623  int SlotSize = Is64Bit ? 8 : 4;
1624  int NewReturnAddrFI =
1625    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
1626  MVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1627  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1628  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1629                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
1630  return Chain;
1631}
1632
1633SDValue
1634X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1635                             unsigned CallConv, bool isVarArg, bool isTailCall,
1636                             const SmallVectorImpl<ISD::OutputArg> &Outs,
1637                             const SmallVectorImpl<ISD::InputArg> &Ins,
1638                             DebugLoc dl, SelectionDAG &DAG,
1639                             SmallVectorImpl<SDValue> &InVals) {
1640
1641  MachineFunction &MF = DAG.getMachineFunction();
1642  bool Is64Bit        = Subtarget->is64Bit();
1643  bool IsStructRet    = CallIsStructReturn(Outs);
1644
1645  assert((!isTailCall ||
1646          (CallConv == CallingConv::Fast && PerformTailCallOpt)) &&
1647         "IsEligibleForTailCallOptimization missed a case!");
1648  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
1649         "Var args not supported with calling convention fastcc");
1650
1651  // Analyze operands of the call, assigning locations to each operand.
1652  SmallVector<CCValAssign, 16> ArgLocs;
1653  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1654                 ArgLocs, *DAG.getContext());
1655  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1656
1657  // Get a count of how many bytes are to be pushed on the stack.
1658  unsigned NumBytes = CCInfo.getNextStackOffset();
1659  if (PerformTailCallOpt && CallConv == CallingConv::Fast)
1660    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1661
1662  int FPDiff = 0;
1663  if (isTailCall) {
1664    // Lower arguments at fp - stackoffset + fpdiff.
1665    unsigned NumBytesCallerPushed =
1666      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1667    FPDiff = NumBytesCallerPushed - NumBytes;
1668
1669    // Set the delta of movement of the returnaddr stackslot.
1670    // But only set if delta is greater than previous delta.
1671    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1672      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1673  }
1674
1675  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1676
1677  SDValue RetAddrFrIdx;
1678  // Load the return address for tail calls.
1679  Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit,
1680                                  FPDiff, dl);
1681
1682  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1683  SmallVector<SDValue, 8> MemOpChains;
1684  SDValue StackPtr;
1685
1686  // Walk the register/memloc assignments, inserting copies/loads.  In the case
1687  // of tail call optimization, arguments are handled later.
1688  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1689    CCValAssign &VA = ArgLocs[i];
1690    MVT RegVT = VA.getLocVT();
1691    SDValue Arg = Outs[i].Val;
1692    ISD::ArgFlagsTy Flags = Outs[i].Flags;
1693    bool isByVal = Flags.isByVal();
1694
1695    // Promote the value if needed.
1696    switch (VA.getLocInfo()) {
1697    default: llvm_unreachable("Unknown loc info!");
1698    case CCValAssign::Full: break;
1699    case CCValAssign::SExt:
1700      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1701      break;
1702    case CCValAssign::ZExt:
1703      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1704      break;
1705    case CCValAssign::AExt:
1706      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1707        // Special case: passing MMX values in XMM registers.
1708        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1709        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1710        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1711      } else
1712        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1713      break;
1714    case CCValAssign::BCvt:
1715      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
1716      break;
1717    case CCValAssign::Indirect: {
1718      // Store the argument.
1719      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
1720      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1721      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
1722                           PseudoSourceValue::getFixedStack(FI), 0);
1723      Arg = SpillSlot;
1724      break;
1725    }
1726    }
1727
1728    if (VA.isRegLoc()) {
1729      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1730    } else {
1731      if (!isTailCall || isByVal) {
1732        assert(VA.isMemLoc());
1733        if (StackPtr.getNode() == 0)
1734          StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
1735
1736        MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1737                                               dl, DAG, VA, Flags));
1738      }
1739    }
1740  }
1741
1742  if (!MemOpChains.empty())
1743    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1744                        &MemOpChains[0], MemOpChains.size());
1745
1746  // Build a sequence of copy-to-reg nodes chained together with token chain
1747  // and flag operands which copy the outgoing args into registers.
1748  SDValue InFlag;
1749  // Tail call byval lowering might overwrite argument registers so in case of
1750  // tail call optimization the copies to registers are lowered later.
1751  if (!isTailCall)
1752    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1753      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1754                               RegsToPass[i].second, InFlag);
1755      InFlag = Chain.getValue(1);
1756    }
1757
1758
1759  if (Subtarget->isPICStyleGOT()) {
1760    // ELF / PIC requires the GOT pointer to be in the EBX register before
1761    // function calls made via the PLT.
1762    if (!isTailCall) {
1763      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
1764                               DAG.getNode(X86ISD::GlobalBaseReg,
1765                                           DebugLoc::getUnknownLoc(),
1766                                           getPointerTy()),
1767                               InFlag);
1768      InFlag = Chain.getValue(1);
1769    } else {
1770      // If we are tail calling and generating PIC/GOT style code load the
1771      // address of the callee into ECX. The value in ecx is used as target of
1772      // the tail jump. This is done to circumvent the ebx/callee-saved problem
1773      // for tail calls on PIC/GOT architectures. Normally we would just put the
1774      // address of GOT into ebx and then call target@PLT. But for tail calls
1775      // ebx would be restored (since ebx is callee saved) before jumping to the
1776      // target@PLT.
1777
1778      // Note: The actual moving to ECX is done further down.
1779      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1780      if (G && !G->getGlobal()->hasHiddenVisibility() &&
1781          !G->getGlobal()->hasProtectedVisibility())
1782        Callee = LowerGlobalAddress(Callee, DAG);
1783      else if (isa<ExternalSymbolSDNode>(Callee))
1784        Callee = LowerExternalSymbol(Callee, DAG);
1785    }
1786  }
1787
1788  if (Is64Bit && isVarArg) {
1789    // From AMD64 ABI document:
1790    // For calls that may call functions that use varargs or stdargs
1791    // (prototype-less calls or calls to functions containing ellipsis (...) in
1792    // the declaration) %al is used as hidden argument to specify the number
1793    // of SSE registers used. The contents of %al do not need to match exactly
1794    // the number of registers, but must be an upper bound on the number of SSE
1795    // registers used and must be in the range 0 - 8 inclusive.
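    //
    // Illustrative example: a variadic call that passes three values in
    // XMM0-XMM2 sets %al to 3 below; per the ABI any upper bound from 3 to 8
    // would also be acceptable.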
1796
1797    // FIXME: Verify this on Win64
1798    // Count the number of XMM registers allocated.
1799    static const unsigned XMMArgRegs[] = {
1800      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1801      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1802    };
1803    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1804    assert((Subtarget->hasSSE1() || !NumXMMRegs)
1805           && "SSE registers cannot be used when SSE is disabled");
1806
1807    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
1808                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1809    InFlag = Chain.getValue(1);
1810  }
1811
1812
1813  // For tail calls lower the arguments to the 'real' stack slot.
1814  if (isTailCall) {
1815    // Force all the incoming stack arguments to be loaded from the stack
1816    // before any new outgoing arguments are stored to the stack, because the
1817    // outgoing stack slots may alias the incoming argument stack slots, and
1818    // the alias isn't otherwise explicit. This is slightly more conservative
1819    // than necessary, because it means that each store effectively depends
1820    // on every argument instead of just those arguments it would clobber.
1821    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
1822
1823    SmallVector<SDValue, 8> MemOpChains2;
1824    SDValue FIN;
1825    int FI = 0;
1826    // Do not flag the preceding copytoreg nodes together with the following ones.
1827    InFlag = SDValue();
1828    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1829      CCValAssign &VA = ArgLocs[i];
1830      if (!VA.isRegLoc()) {
1831        assert(VA.isMemLoc());
1832        SDValue Arg = Outs[i].Val;
1833        ISD::ArgFlagsTy Flags = Outs[i].Flags;
1834        // Create frame index.
1835        int32_t Offset = VA.getLocMemOffset()+FPDiff;
1836        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
1837        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
1838        FIN = DAG.getFrameIndex(FI, getPointerTy());
1839
1840        if (Flags.isByVal()) {
1841          // Copy relative to framepointer.
1842          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
1843          if (StackPtr.getNode() == 0)
1844            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
1845                                          getPointerTy());
1846          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
1847
1848          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
1849                                                           ArgChain,
1850                                                           Flags, DAG, dl));
1851        } else {
1852          // Store relative to framepointer.
1853          MemOpChains2.push_back(
1854            DAG.getStore(ArgChain, dl, Arg, FIN,
1855                         PseudoSourceValue::getFixedStack(FI), 0));
1856        }
1857      }
1858    }
1859
1860    if (!MemOpChains2.empty())
1861      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1862                          &MemOpChains2[0], MemOpChains2.size());
1863
1864    // Copy arguments to their registers.
1865    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1866      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1867                               RegsToPass[i].second, InFlag);
1868      InFlag = Chain.getValue(1);
1869    }
1870    InFlag = SDValue();
1871
1872    // Store the return address to the appropriate stack slot.
1873    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
1874                                     FPDiff, dl);
1875  }
1876
1877  // If the callee is a GlobalAddress node (quite common, every direct call is)
1878  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
1879  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1880    // We should use extra load for direct calls to dllimported functions in
1881    // non-JIT mode.
1882    GlobalValue *GV = G->getGlobal();
1883    if (!GV->hasDLLImportLinkage()) {
1884      unsigned char OpFlags = 0;
1885
1886      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
1887      // external symbols must go through the PLT in PIC mode.  If the symbol
1888      // has hidden or protected visibility, or if it is static or local, then
1889      // we don't need to use the PLT - we can directly call it.
1890      if (Subtarget->isTargetELF() &&
1891          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1892          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
1893        OpFlags = X86II::MO_PLT;
1894      } else if (Subtarget->isPICStyleStubAny() &&
1895               (GV->isDeclaration() || GV->isWeakForLinker()) &&
1896               Subtarget->getDarwinVers() < 9) {
1897        // PC-relative references to external symbols should go through $stub,
1898        // unless we're building with the leopard linker or later, which
1899        // automatically synthesizes these stubs.
1900        OpFlags = X86II::MO_DARWIN_STUB;
1901      }
1902
1903      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
1904                                          G->getOffset(), OpFlags);
1905    }
1906  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1907    unsigned char OpFlags = 0;
1908
1909    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
1910    // symbols should go through the PLT.
1911    if (Subtarget->isTargetELF() &&
1912        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
1913      OpFlags = X86II::MO_PLT;
1914    } else if (Subtarget->isPICStyleStubAny() &&
1915             Subtarget->getDarwinVers() < 9) {
1916      // PC-relative references to external symbols should go through $stub,
1917      // unless we're building with the leopard linker or later, which
1918      // automatically synthesizes these stubs.
1919      OpFlags = X86II::MO_DARWIN_STUB;
1920    }
1921
1922    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
1923                                         OpFlags);
1924  } else if (isTailCall) {
1925    unsigned Opc = Is64Bit ? X86::R11 : X86::EAX;
1926
1927    Chain = DAG.getCopyToReg(Chain, dl,
1928                             DAG.getRegister(Opc, getPointerTy()),
1929                             Callee, InFlag);
1930    Callee = DAG.getRegister(Opc, getPointerTy());
1931    // Add register as live out.
1932    MF.getRegInfo().addLiveOut(Opc);
1933  }
1934
1935  // Returns a chain & a flag for retval copy to use.
1936  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
1937  SmallVector<SDValue, 8> Ops;
1938
1939  if (isTailCall) {
1940    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1941                           DAG.getIntPtrConstant(0, true), InFlag);
1942    InFlag = Chain.getValue(1);
1943  }
1944
1945  Ops.push_back(Chain);
1946  Ops.push_back(Callee);
1947
1948  if (isTailCall)
1949    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
1950
1951  // Add argument registers to the end of the list so that they are known live
1952  // into the call.
1953  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1954    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1955                                  RegsToPass[i].second.getValueType()));
1956
1957  // Add an implicit use of the GOT pointer in EBX.
1958  if (!isTailCall && Subtarget->isPICStyleGOT())
1959    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
1960
1961  // Add an implicit use of AL for x86 vararg functions.
1962  if (Is64Bit && isVarArg)
1963    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
1964
1965  if (InFlag.getNode())
1966    Ops.push_back(InFlag);
1967
1968  if (isTailCall) {
1969    // If this is the first return lowered for this function, add the regs
1970    // to the liveout set for the function.
1971    if (MF.getRegInfo().liveout_empty()) {
1972      SmallVector<CCValAssign, 16> RVLocs;
1973      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
1974                     *DAG.getContext());
1975      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1976      for (unsigned i = 0; i != RVLocs.size(); ++i)
1977        if (RVLocs[i].isRegLoc())
1978          MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1979    }
1980
1981    assert(((Callee.getOpcode() == ISD::Register &&
1982               (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
1983                cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
1984              Callee.getOpcode() == ISD::TargetExternalSymbol ||
1985              Callee.getOpcode() == ISD::TargetGlobalAddress) &&
1986             "Expecting a global address, external symbol, or register");
1987
1988    return DAG.getNode(X86ISD::TC_RETURN, dl,
1989                       NodeTys, &Ops[0], Ops.size());
1990  }
1991
1992  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
1993  InFlag = Chain.getValue(1);
1994
1995  // Create the CALLSEQ_END node.
1996  unsigned NumBytesForCalleeToPush;
1997  if (IsCalleePop(isVarArg, CallConv))
1998    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
1999  else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
2000    // If this is a call to a struct-return function, the callee
2001    // pops the hidden struct pointer, so we have to push it back.
2002    // This is common for Darwin/X86, Linux & Mingw32 targets.
2003    NumBytesForCalleeToPush = 4;
2004  else
2005    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2006
2007  // Returns a flag for retval copy to use.
2008  Chain = DAG.getCALLSEQ_END(Chain,
2009                             DAG.getIntPtrConstant(NumBytes, true),
2010                             DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2011                                                   true),
2012                             InFlag);
2013  InFlag = Chain.getValue(1);
2014
2015  // Handle result values, copying them out of physregs into vregs that we
2016  // return.
2017  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2018                         Ins, dl, DAG, InVals);
2019}
2020
2021
2022//===----------------------------------------------------------------------===//
2023//                Fast Calling Convention (tail call) implementation
2024//===----------------------------------------------------------------------===//
2025
2026//  Like StdCall, the callee cleans up the arguments, except that ECX is
2027//  reserved to hold the tail-called function's address. Only 2 registers are
2028//  free for argument passing (inreg). Tail call optimization is performed
2029//  provided:
2030//                * tailcallopt is enabled
2031//                * caller/callee are fastcc
2032//  On X86_64 architecture with GOT-style position independent code only local
2033//  (within module) calls are supported at the moment.
2034//  To keep the stack aligned according to the platform ABI, the function
2035//  GetAlignedArgumentStackSize ensures that the argument delta is always a
2036//  multiple of the stack alignment. (Dynamic linkers need this, e.g. darwin's dyld.)
2037//  If the tail-called callee has more arguments than the caller, the caller
2038//  needs to make sure that there is room to move the RETADDR to. This is
2039//  achieved by reserving an area the size of the argument delta right after the
2040//  original RETADDR, but before the saved frame pointer or the spilled registers,
2041//  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2042//  stack layout:
2043//    arg1
2044//    arg2
2045//    RETADDR
2046//    [ new RETADDR
2047//      move area ]
2048//    (possible EBP)
2049//    ESI
2050//    EDI
2051//    local1 ..
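//
//  Illustration (hypothetical numbers): if the caller's fastcc argument area is
//  8 bytes and the callee needs 16, LowerCall computes FPDiff = 8 - 16 = -8 and
//  EmitTailCallStoreRetAddr moves the return address into the reserved move
//  area so the callee's larger argument block fits.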
2052
2053/// GetAlignedArgumentStackSize - Align the stack argument size, e.g. to
2054/// 16n + 12 for a 16 byte alignment requirement.
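///
/// For illustration (assuming a 16-byte stack alignment and 4-byte slots, i.e.
/// a 32-bit target):
///   StackSize = 20: 20 & 15 = 4 <= 12, so Offset = 20 + (12 - 4) = 28 = 16*1 + 12.
///   StackSize = 30: 30 & 15 = 14 > 12, so Offset = (30 & ~15) + 16 + 12 = 44 = 16*2 + 12.
/// Either way, the result plus the return-address slot is 16-byte aligned.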
2055unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2056                                                        SelectionDAG& DAG) {
2057  MachineFunction &MF = DAG.getMachineFunction();
2058  const TargetMachine &TM = MF.getTarget();
2059  const TargetFrameInfo &TFI = *TM.getFrameInfo();
2060  unsigned StackAlignment = TFI.getStackAlignment();
2061  uint64_t AlignMask = StackAlignment - 1;
2062  int64_t Offset = StackSize;
2063  uint64_t SlotSize = TD->getPointerSize();
2064  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2065    // The remainder is below StackAlignment - SlotSize, so just add the difference.
2066    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2067  } else {
2068    // Mask out the lower bits, then add StackAlignment plus (StackAlignment - SlotSize).
2069    Offset = ((~AlignMask) & Offset) + StackAlignment +
2070      (StackAlignment-SlotSize);
2071  }
2072  return Offset;
2073}
2074
2075/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2076/// for tail call optimization. Targets which want to do tail call
2077/// optimization should implement this function.
2078bool
2079X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2080                                                     unsigned CalleeCC,
2081                                                     bool isVarArg,
2082                                      const SmallVectorImpl<ISD::InputArg> &Ins,
2083                                                     SelectionDAG& DAG) const {
2084  MachineFunction &MF = DAG.getMachineFunction();
2085  unsigned CallerCC = MF.getFunction()->getCallingConv();
2086  return CalleeCC == CallingConv::Fast && CallerCC == CalleeCC;
2087}
2088
2089FastISel *
2090X86TargetLowering::createFastISel(MachineFunction &mf,
2091                                  MachineModuleInfo *mmo,
2092                                  DwarfWriter *dw,
2093                                  DenseMap<const Value *, unsigned> &vm,
2094                                  DenseMap<const BasicBlock *,
2095                                           MachineBasicBlock *> &bm,
2096                                  DenseMap<const AllocaInst *, int> &am
2097#ifndef NDEBUG
2098                                  , SmallSet<Instruction*, 8> &cil
2099#endif
2100                                  ) {
2101  return X86::createFastISel(mf, mmo, dw, vm, bm, am
2102#ifndef NDEBUG
2103                             , cil
2104#endif
2105                             );
2106}
2107
2108
2109//===----------------------------------------------------------------------===//
2110//                           Other Lowering Hooks
2111//===----------------------------------------------------------------------===//
2112
2113
2114SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
2115  MachineFunction &MF = DAG.getMachineFunction();
2116  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2117  int ReturnAddrIndex = FuncInfo->getRAIndex();
2118
2119  if (ReturnAddrIndex == 0) {
2120    // Set up a frame object for the return address.
2121    uint64_t SlotSize = TD->getPointerSize();
2122    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize);
2123    FuncInfo->setRAIndex(ReturnAddrIndex);
2124  }
2125
2126  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2127}
2128
2129
2130bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2131                                       bool hasSymbolicDisplacement) {
2132  // Offset should fit into 32 bit immediate field.
2133  if (!isInt32(Offset))
2134    return false;
2135
2136  // If we don't have a symbolic displacement - we don't have any extra
2137  // restrictions.
2138  if (!hasSymbolicDisplacement)
2139    return true;
2140
2141  // FIXME: Some tweaks might be needed for medium code model.
2142  if (M != CodeModel::Small && M != CodeModel::Kernel)
2143    return false;
2144
2145  // For the small code model we assume that the last object lies at least 16MB
2146  // below the 31-bit address boundary. We may also accept fairly large negative
2147  // constants, knowing that all objects are in the positive half of the address space.
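  // As a concrete (illustrative) example: with the small code model an offset of
  // 15MB is accepted by the check below, while 16MB or more is rejected.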
2148  if (M == CodeModel::Small && Offset < 16*1024*1024)
2149    return true;
2150
2151  // For the kernel code model we know that all objects reside in the negative
2152  // half of the 32-bit address space. We must not accept negative offsets, since
2153  // they may fall outside that range, but we may accept fairly large positive ones.
2154  if (M == CodeModel::Kernel && Offset > 0)
2155    return true;
2156
2157  return false;
2158}
2159
2160/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
2161/// specific condition code, returning the condition code and the LHS/RHS of the
2162/// comparison to make.
2163static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2164                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2165  if (!isFP) {
2166    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2167      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2168        // X > -1   -> X == 0, jump !sign.
2169        RHS = DAG.getConstant(0, RHS.getValueType());
2170        return X86::COND_NS;
2171      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2172        // X < 0   -> X == 0, jump on sign.
2173        return X86::COND_S;
2174      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2175        // X < 1   -> X <= 0
2176        RHS = DAG.getConstant(0, RHS.getValueType());
2177        return X86::COND_LE;
2178      }
2179    }
2180
2181    switch (SetCCOpcode) {
2182    default: llvm_unreachable("Invalid integer condition!");
2183    case ISD::SETEQ:  return X86::COND_E;
2184    case ISD::SETGT:  return X86::COND_G;
2185    case ISD::SETGE:  return X86::COND_GE;
2186    case ISD::SETLT:  return X86::COND_L;
2187    case ISD::SETLE:  return X86::COND_LE;
2188    case ISD::SETNE:  return X86::COND_NE;
2189    case ISD::SETULT: return X86::COND_B;
2190    case ISD::SETUGT: return X86::COND_A;
2191    case ISD::SETULE: return X86::COND_BE;
2192    case ISD::SETUGE: return X86::COND_AE;
2193    }
2194  }
2195
2196  // First determine if it is required or is profitable to flip the operands.
2197
2198  // If LHS is a foldable load, but RHS is not, flip the condition.
2199  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2200      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2201    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2202    std::swap(LHS, RHS);
2203  }
2204
2205  switch (SetCCOpcode) {
2206  default: break;
2207  case ISD::SETOLT:
2208  case ISD::SETOLE:
2209  case ISD::SETUGT:
2210  case ISD::SETUGE:
2211    std::swap(LHS, RHS);
2212    break;
2213  }
2214
2215  // On a floating point condition, the flags are set as follows:
2216  // ZF  PF  CF   op
2217  //  0 | 0 | 0 | X > Y
2218  //  0 | 0 | 1 | X < Y
2219  //  1 | 0 | 0 | X == Y
2220  //  1 | 1 | 1 | unordered
2221  switch (SetCCOpcode) {
2222  default: llvm_unreachable("Condcode should be pre-legalized away");
2223  case ISD::SETUEQ:
2224  case ISD::SETEQ:   return X86::COND_E;
2225  case ISD::SETOLT:              // flipped
2226  case ISD::SETOGT:
2227  case ISD::SETGT:   return X86::COND_A;
2228  case ISD::SETOLE:              // flipped
2229  case ISD::SETOGE:
2230  case ISD::SETGE:   return X86::COND_AE;
2231  case ISD::SETUGT:              // flipped
2232  case ISD::SETULT:
2233  case ISD::SETLT:   return X86::COND_B;
2234  case ISD::SETUGE:              // flipped
2235  case ISD::SETULE:
2236  case ISD::SETLE:   return X86::COND_BE;
2237  case ISD::SETONE:
2238  case ISD::SETNE:   return X86::COND_NE;
2239  case ISD::SETUO:   return X86::COND_P;
2240  case ISD::SETO:    return X86::COND_NP;
2241  }
2242}
2243
2244/// hasFPCMov - is there a floating point cmov for the specific X86 condition
2245/// code. The current x86 ISA includes the following FP cmov instructions:
2246/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2247static bool hasFPCMov(unsigned X86CC) {
2248  switch (X86CC) {
2249  default:
2250    return false;
2251  case X86::COND_B:
2252  case X86::COND_BE:
2253  case X86::COND_E:
2254  case X86::COND_P:
2255  case X86::COND_A:
2256  case X86::COND_AE:
2257  case X86::COND_NE:
2258  case X86::COND_NP:
2259    return true;
2260  }
2261}
2262
2263/// isUndefOrInRange - Return true if Val is undef or if its value falls within
2264/// the specified range [Low, Hi).
2265static bool isUndefOrInRange(int Val, int Low, int Hi) {
2266  return (Val < 0) || (Val >= Low && Val < Hi);
2267}
2268
2269/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2270/// specified value.
2271static bool isUndefOrEqual(int Val, int CmpVal) {
2272  if (Val < 0 || Val == CmpVal)
2273    return true;
2274  return false;
2275}
2276
2277/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2278/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
2279/// the second operand.
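/// For example (illustrative): for v4i32, <2, 1, 0, 3> is a valid PSHUFD mask,
/// while <0, 5, 2, 3> is not, because element 5 would have to come from the
/// second operand.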
2280static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, MVT VT) {
2281  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
2282    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2283  if (VT == MVT::v2f64 || VT == MVT::v2i64)
2284    return (Mask[0] < 2 && Mask[1] < 2);
2285  return false;
2286}
2287
2288bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
2289  SmallVector<int, 8> M;
2290  N->getMask(M);
2291  return ::isPSHUFDMask(M, N->getValueType(0));
2292}
2293
2294/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
2295/// is suitable for input to PSHUFHW.
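/// For example (illustrative): <0, 1, 2, 3, 7, 6, 5, 4> is a valid PSHUFHW mask
/// for v8i16 (the low quadword stays in order, the high quadword is permuted
/// within elements 4-7), while <1, 0, 2, 3, 4, 5, 6, 7> is not.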
2296static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, MVT VT) {
2297  if (VT != MVT::v8i16)
2298    return false;
2299
2300  // Lower quadword copied in order or undef.
2301  for (int i = 0; i != 4; ++i)
2302    if (Mask[i] >= 0 && Mask[i] != i)
2303      return false;
2304
2305  // Upper quadword shuffled.
2306  for (int i = 4; i != 8; ++i)
2307    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2308      return false;
2309
2310  return true;
2311}
2312
2313bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
2314  SmallVector<int, 8> M;
2315  N->getMask(M);
2316  return ::isPSHUFHWMask(M, N->getValueType(0));
2317}
2318
2319/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
2320/// is suitable for input to PSHUFLW.
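/// For example (illustrative): <3, 2, 1, 0, 4, 5, 6, 7> is a valid PSHUFLW mask
/// for v8i16 (the high quadword stays in order, the low quadword is permuted
/// within elements 0-3), while <0, 1, 2, 7, 4, 5, 6, 7> is not.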
2321static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, MVT VT) {
2322  if (VT != MVT::v8i16)
2323    return false;
2324
2325  // Upper quadword copied in order.
2326  for (int i = 4; i != 8; ++i)
2327    if (Mask[i] >= 0 && Mask[i] != i)
2328      return false;
2329
2330  // Lower quadword shuffled.
2331  for (int i = 0; i != 4; ++i)
2332    if (Mask[i] >= 4)
2333      return false;
2334
2335  return true;
2336}
2337
2338bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
2339  SmallVector<int, 8> M;
2340  N->getMask(M);
2341  return ::isPSHUFLWMask(M, N->getValueType(0));
2342}
2343
2344/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2345/// specifies a shuffle of elements that is suitable for input to SHUFP*.
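/// For example (illustrative): for a 4-element type, <0, 3, 4, 7> is a valid
/// SHUFP mask (the low half comes from the first vector, the high half from the
/// second), while <4, 5, 0, 1> is not; that reversed form is what
/// isCommutedSHUFPMask below accepts.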
2346static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) {
2347  int NumElems = VT.getVectorNumElements();
2348  if (NumElems != 2 && NumElems != 4)
2349    return false;
2350
2351  int Half = NumElems / 2;
2352  for (int i = 0; i < Half; ++i)
2353    if (!isUndefOrInRange(Mask[i], 0, NumElems))
2354      return false;
2355  for (int i = Half; i < NumElems; ++i)
2356    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2357      return false;
2358
2359  return true;
2360}
2361
2362bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
2363  SmallVector<int, 8> M;
2364  N->getMask(M);
2365  return ::isSHUFPMask(M, N->getValueType(0));
2366}
2367
2368/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
2369/// the reverse of what x86 shuffles want. x86 shuffles require the lower
2370/// half elements to come from vector 1 (which would equal the dest.) and
2371/// the upper half to come from vector 2.
2372static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) {
2373  int NumElems = VT.getVectorNumElements();
2374
2375  if (NumElems != 2 && NumElems != 4)
2376    return false;
2377
2378  int Half = NumElems / 2;
2379  for (int i = 0; i < Half; ++i)
2380    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2381      return false;
2382  for (int i = Half; i < NumElems; ++i)
2383    if (!isUndefOrInRange(Mask[i], 0, NumElems))
2384      return false;
2385  return true;
2386}
2387
2388static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
2389  SmallVector<int, 8> M;
2390  N->getMask(M);
2391  return isCommutedSHUFPMask(M, N->getValueType(0));
2392}
2393
2394/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
2395/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
2396bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
2397  if (N->getValueType(0).getVectorNumElements() != 4)
2398    return false;
2399
2400  // Expect element 0 == 6, element 1 == 7, element 2 == 2, element 3 == 3
2401  return isUndefOrEqual(N->getMaskElt(0), 6) &&
2402         isUndefOrEqual(N->getMaskElt(1), 7) &&
2403         isUndefOrEqual(N->getMaskElt(2), 2) &&
2404         isUndefOrEqual(N->getMaskElt(3), 3);
2405}
2406
2407/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
2408/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
2409bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
2410  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2411
2412  if (NumElems != 2 && NumElems != 4)
2413    return false;
2414
2415  for (unsigned i = 0; i < NumElems/2; ++i)
2416    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
2417      return false;
2418
2419  for (unsigned i = NumElems/2; i < NumElems; ++i)
2420    if (!isUndefOrEqual(N->getMaskElt(i), i))
2421      return false;
2422
2423  return true;
2424}
2425
2426/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
2427/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
2428/// and MOVLHPS.
2429bool X86::isMOVHPMask(ShuffleVectorSDNode *N) {
2430  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2431
2432  if (NumElems != 2 && NumElems != 4)
2433    return false;
2434
2435  for (unsigned i = 0; i < NumElems/2; ++i)
2436    if (!isUndefOrEqual(N->getMaskElt(i), i))
2437      return false;
2438
2439  for (unsigned i = 0; i < NumElems/2; ++i)
2440    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
2441      return false;
2442
2443  return true;
2444}
2445
2446/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
2447/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
2448/// <2, 3, 2, 3>
2449bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
2450  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2451
2452  if (NumElems != 4)
2453    return false;
2454
2455  return isUndefOrEqual(N->getMaskElt(0), 2) &&
2456         isUndefOrEqual(N->getMaskElt(1), 3) &&
2457         isUndefOrEqual(N->getMaskElt(2), 2) &&
2458         isUndefOrEqual(N->getMaskElt(3), 3);
2459}
2460
2461/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
2462/// specifies a shuffle of elements that is suitable for input to UNPCKL.
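/// For example (illustrative): for v4i32, UNPCKL interleaves the low halves of
/// the two inputs, so <0, 4, 1, 5> is the canonical mask; with V2IsSplat the
/// odd elements only need to equal NumElts, e.g. <0, 4, 1, 4>.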
2463static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, MVT VT,
2464                         bool V2IsSplat = false) {
2465  int NumElts = VT.getVectorNumElements();
2466  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2467    return false;
2468
2469  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2470    int BitI  = Mask[i];
2471    int BitI1 = Mask[i+1];
2472    if (!isUndefOrEqual(BitI, j))
2473      return false;
2474    if (V2IsSplat) {
2475      if (!isUndefOrEqual(BitI1, NumElts))
2476        return false;
2477    } else {
2478      if (!isUndefOrEqual(BitI1, j + NumElts))
2479        return false;
2480    }
2481  }
2482  return true;
2483}
2484
2485bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2486  SmallVector<int, 8> M;
2487  N->getMask(M);
2488  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
2489}
2490
2491/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
2492/// specifies a shuffle of elements that is suitable for input to UNPCKH.
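/// For example, the v4i32 mask <2, 6, 3, 7> (interleave the high halves of
/// V1 and V2) satisfies this predicate.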
2493static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, MVT VT,
2494                         bool V2IsSplat = false) {
2495  int NumElts = VT.getVectorNumElements();
2496  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2497    return false;
2498
2499  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2500    int BitI  = Mask[i];
2501    int BitI1 = Mask[i+1];
2502    if (!isUndefOrEqual(BitI, j + NumElts/2))
2503      return false;
2504    if (V2IsSplat) {
2505      if (!isUndefOrEqual(BitI1, NumElts))
2506        return false;
2507    } else {
2508      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
2509        return false;
2510    }
2511  }
2512  return true;
2513}
2514
2515bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2516  SmallVector<int, 8> M;
2517  N->getMask(M);
2518  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
2519}
2520
2521/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
2522/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
2523/// <0, 0, 1, 1>
2524static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) {
2525  int NumElems = VT.getVectorNumElements();
2526  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2527    return false;
2528
2529  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
2530    int BitI  = Mask[i];
2531    int BitI1 = Mask[i+1];
2532    if (!isUndefOrEqual(BitI, j))
2533      return false;
2534    if (!isUndefOrEqual(BitI1, j))
2535      return false;
2536  }
2537  return true;
2538}
2539
2540bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
2541  SmallVector<int, 8> M;
2542  N->getMask(M);
2543  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
2544}
2545
2546/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
2547/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
2548/// <2, 2, 3, 3>
2549static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) {
2550  int NumElems = VT.getVectorNumElements();
2551  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2552    return false;
2553
2554  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
2555    int BitI  = Mask[i];
2556    int BitI1 = Mask[i+1];
2557    if (!isUndefOrEqual(BitI, j))
2558      return false;
2559    if (!isUndefOrEqual(BitI1, j))
2560      return false;
2561  }
2562  return true;
2563}
2564
2565bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
2566  SmallVector<int, 8> M;
2567  N->getMask(M);
2568  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
2569}
2570
2571/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
2572/// specifies a shuffle of elements that is suitable for input to MOVSS,
2573/// MOVSD, and MOVD, i.e. setting the lowest element.
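/// For example, the v4f32 mask <4, 1, 2, 3> (take element 0 from V2 and the
/// rest from V1) satisfies this predicate.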
2574static bool isMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT) {
2575  if (VT.getVectorElementType().getSizeInBits() < 32)
2576    return false;
2577
2578  int NumElts = VT.getVectorNumElements();
2579
2580  if (!isUndefOrEqual(Mask[0], NumElts))
2581    return false;
2582
2583  for (int i = 1; i < NumElts; ++i)
2584    if (!isUndefOrEqual(Mask[i], i))
2585      return false;
2586
2587  return true;
2588}
2589
2590bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
2591  SmallVector<int, 8> M;
2592  N->getMask(M);
2593  return ::isMOVLMask(M, N->getValueType(0));
2594}
2595
2596/// isCommutedMOVLMask - Returns true if the shuffle mask is exactly the reverse
2597/// of what x86 movss wants. X86 movss requires the lowest element to be the
2598/// lowest element of vector 2 and the others to come from vector 1 in order.
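/// For illustration: with 4 elements, a mask such as <0, 5, 6, 7> is accepted;
/// commuting the operands (and hence the mask) yields <4, 1, 2, 3>, which is
/// the MOVL pattern checked by isMOVLMask above.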
2599static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT,
2600                               bool V2IsSplat = false, bool V2IsUndef = false) {
2601  int NumOps = VT.getVectorNumElements();
2602  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
2603    return false;
2604
2605  if (!isUndefOrEqual(Mask[0], 0))
2606    return false;
2607
2608  for (int i = 1; i < NumOps; ++i)
2609    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
2610          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
2611          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
2612      return false;
2613
2614  return true;
2615}
2616
2617static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
2618                           bool V2IsUndef = false) {
2619  SmallVector<int, 8> M;
2620  N->getMask(M);
2621  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
2622}
2623
2624/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2625/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
2626bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
2627  if (N->getValueType(0).getVectorNumElements() != 4)
2628    return false;
2629
2630  // Expect 1, 1, 3, 3
2631  for (unsigned i = 0; i < 2; ++i) {
2632    int Elt = N->getMaskElt(i);
2633    if (Elt >= 0 && Elt != 1)
2634      return false;
2635  }
2636
2637  bool HasHi = false;
2638  for (unsigned i = 2; i < 4; ++i) {
2639    int Elt = N->getMaskElt(i);
2640    if (Elt >= 0 && Elt != 3)
2641      return false;
2642    if (Elt == 3)
2643      HasHi = true;
2644  }
2645  // Don't use movshdup if it can be done with a shufps.
2646  // FIXME: verify that matching u, u, 3, 3 is what we want.
2647  return HasHi;
2648}
2649
2650/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2651/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
2652bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
2653  if (N->getValueType(0).getVectorNumElements() != 4)
2654    return false;
2655
2656  // Expect 0, 0, 2, 2
2657  for (unsigned i = 0; i < 2; ++i)
2658    if (N->getMaskElt(i) > 0)
2659      return false;
2660
2661  bool HasHi = false;
2662  for (unsigned i = 2; i < 4; ++i) {
2663    int Elt = N->getMaskElt(i);
2664    if (Elt >= 0 && Elt != 2)
2665      return false;
2666    if (Elt == 2)
2667      HasHi = true;
2668  }
2669  // Don't use movsldup if it can be done with a shufps.
2670  return HasHi;
2671}
2672
2673/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2674/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
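/// For example, the v2f64 mask <0, 0> (duplicate the low element) satisfies
/// this predicate.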
2675bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
2676  int e = N->getValueType(0).getVectorNumElements() / 2;
2677
2678  for (int i = 0; i < e; ++i)
2679    if (!isUndefOrEqual(N->getMaskElt(i), i))
2680      return false;
2681  for (int i = 0; i < e; ++i)
2682    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
2683      return false;
2684  return true;
2685}
2686
2687/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
2688/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
2689/// instructions.
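/// For illustration: with 4 elements, bits [2*i+1:2*i] of the immediate select
/// the source element for result element i, so the identity mask <0, 1, 2, 3>
/// encodes as 0xE4 and the reversal <3, 2, 1, 0> encodes as 0x1B.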
2690unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
2691  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2692  int NumOperands = SVOp->getValueType(0).getVectorNumElements();
2693
2694  unsigned Shift = (NumOperands == 4) ? 2 : 1;
2695  unsigned Mask = 0;
2696  for (int i = 0; i < NumOperands; ++i) {
2697    int Val = SVOp->getMaskElt(NumOperands-i-1);
2698    if (Val < 0) Val = 0;
2699    if (Val >= NumOperands) Val -= NumOperands;
2700    Mask |= Val;
2701    if (i != NumOperands - 1)
2702      Mask <<= Shift;
2703  }
2704  return Mask;
2705}
2706
2707/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
2708/// the specified VECTOR_SHUFFLE mask with PSHUFHW
2709/// instructions.
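/// For illustration: only mask elements 4-7 are encoded (two bits each, with 4
/// subtracted), so a mask of <0, 1, 2, 3, 7, 6, 5, 4> produces the immediate
/// 0x1B and <0, 1, 2, 3, 4, 5, 6, 7> produces 0xE4.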
2710unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
2711  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2712  unsigned Mask = 0;
2713  // 8 nodes, but we only care about the last 4.
2714  for (unsigned i = 7; i >= 4; --i) {
2715    int Val = SVOp->getMaskElt(i);
2716    if (Val >= 0)
2717      Mask |= (Val - 4);
2718    if (i != 4)
2719      Mask <<= 2;
2720  }
2721  return Mask;
2722}
2723
2724/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
2725/// the specified VECTOR_SHUFFLE mask with PSHUFLW
2726/// instructions.
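/// For illustration: only mask elements 0-3 are encoded (two bits each), so a
/// mask of <3, 2, 1, 0, 4, 5, 6, 7> produces the immediate 0x1B and the
/// identity mask produces 0xE4.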
2727unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
2728  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2729  unsigned Mask = 0;
2730  // 8 nodes, but we only care about the first 4.
2731  for (int i = 3; i >= 0; --i) {
2732    int Val = SVOp->getMaskElt(i);
2733    if (Val >= 0)
2734      Mask |= Val;
2735    if (i != 0)
2736      Mask <<= 2;
2737  }
2738  return Mask;
2739}
2740
2741/// isZeroNode - Returns true if Elt is a constant zero or a floating point
2742/// constant +0.0.
2743bool X86::isZeroNode(SDValue Elt) {
2744  return ((isa<ConstantSDNode>(Elt) &&
2745           cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
2746          (isa<ConstantFPSDNode>(Elt) &&
2747           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
2748}
2749
2750/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
2751/// their permute mask.
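/// For example, shuffle(V1, V2, <0, 5, 2, 7>) becomes
/// shuffle(V2, V1, <4, 1, 6, 3>).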
2752static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
2753                                    SelectionDAG &DAG) {
2754  MVT VT = SVOp->getValueType(0);
2755  unsigned NumElems = VT.getVectorNumElements();
2756  SmallVector<int, 8> MaskVec;
2757
2758  for (unsigned i = 0; i != NumElems; ++i) {
2759    int idx = SVOp->getMaskElt(i);
2760    if (idx < 0)
2761      MaskVec.push_back(idx);
2762    else if (idx < (int)NumElems)
2763      MaskVec.push_back(idx + NumElems);
2764    else
2765      MaskVec.push_back(idx - NumElems);
2766  }
2767  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
2768                              SVOp->getOperand(0), &MaskVec[0]);
2769}
2770
2771/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
2772/// the two vector operands have swapped position.
2773static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, MVT VT) {
2774  unsigned NumElems = VT.getVectorNumElements();
2775  for (unsigned i = 0; i != NumElems; ++i) {
2776    int idx = Mask[i];
2777    if (idx < 0)
2778      continue;
2779    else if (idx < (int)NumElems)
2780      Mask[i] = idx + NumElems;
2781    else
2782      Mask[i] = idx - NumElems;
2783  }
2784}
2785
2786/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
2787/// match movhlps. The lower half elements should come from upper half of
2788/// V1 (and in order), and the upper half elements should come from the upper
2789/// half of V2 (and in order).
2790static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
2791  if (Op->getValueType(0).getVectorNumElements() != 4)
2792    return false;
2793  for (unsigned i = 0, e = 2; i != e; ++i)
2794    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
2795      return false;
2796  for (unsigned i = 2; i != 4; ++i)
2797    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
2798      return false;
2799  return true;
2800}
2801
2802/// isScalarLoadToVector - Returns true if the node is a scalar load that
2803/// is promoted to a vector. It also returns the LoadSDNode by reference if
2804/// required.
2805static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
2806  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
2807    return false;
2808  N = N->getOperand(0).getNode();
2809  if (!ISD::isNON_EXTLoad(N))
2810    return false;
2811  if (LD)
2812    *LD = cast<LoadSDNode>(N);
2813  return true;
2814}
2815
2816/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
2817/// match movlp{s|d}. The lower half elements should come from the lower half of
2818/// V1 (and in order), and the upper half elements should come from the upper
2819/// half of V2 (and in order). And since V1 will become the source of the
2820/// MOVLP, it must be either a vector load or a scalar load to vector.
2821static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
2822                               ShuffleVectorSDNode *Op) {
2823  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
2824    return false;
2825  // If V2 is a vector load, don't do this transformation. We will try to use
2826  // a load-folding shufps op instead.
2827  if (ISD::isNON_EXTLoad(V2))
2828    return false;
2829
2830  unsigned NumElems = Op->getValueType(0).getVectorNumElements();
2831
2832  if (NumElems != 2 && NumElems != 4)
2833    return false;
2834  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
2835    if (!isUndefOrEqual(Op->getMaskElt(i), i))
2836      return false;
2837  for (unsigned i = NumElems/2; i != NumElems; ++i)
2838    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
2839      return false;
2840  return true;
2841}
2842
2843/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
2844/// all the same.
2845static bool isSplatVector(SDNode *N) {
2846  if (N->getOpcode() != ISD::BUILD_VECTOR)
2847    return false;
2848
2849  SDValue SplatValue = N->getOperand(0);
2850  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
2851    if (N->getOperand(i) != SplatValue)
2852      return false;
2853  return true;
2854}
2855
2856/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
2857/// to a zero vector.
2858/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
2859static bool isZeroShuffle(ShuffleVectorSDNode *N) {
2860  SDValue V1 = N->getOperand(0);
2861  SDValue V2 = N->getOperand(1);
2862  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2863  for (unsigned i = 0; i != NumElems; ++i) {
2864    int Idx = N->getMaskElt(i);
2865    if (Idx >= (int)NumElems) {
2866      unsigned Opc = V2.getOpcode();
2867      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
2868        continue;
2869      if (Opc != ISD::BUILD_VECTOR ||
2870          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
2871        return false;
2872    } else if (Idx >= 0) {
2873      unsigned Opc = V1.getOpcode();
2874      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
2875        continue;
2876      if (Opc != ISD::BUILD_VECTOR ||
2877          !X86::isZeroNode(V1.getOperand(Idx)))
2878        return false;
2879    }
2880  }
2881  return true;
2882}
2883
2884/// getZeroVector - Returns a vector of specified type with all zero elements.
2885///
2886static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG,
2887                             DebugLoc dl) {
2888  assert(VT.isVector() && "Expected a vector type");
2889
2890  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
2891  // type.  This ensures they get CSE'd.
2892  SDValue Vec;
2893  if (VT.getSizeInBits() == 64) { // MMX
2894    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
2895    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
2896  } else if (HasSSE2) {  // SSE2
2897    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
2898    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
2899  } else { // SSE1
2900    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
2901    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
2902  }
2903  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
2904}
2905
2906/// getOnesVector - Returns a vector of specified type with all bits set.
2907///
2908static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
2909  assert(VT.isVector() && "Expected a vector type");
2910
2911  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
2912  // type.  This ensures they get CSE'd.
2913  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
2914  SDValue Vec;
2915  if (VT.getSizeInBits() == 64)  // MMX
2916    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
2917  else                                              // SSE
2918    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
2919  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
2920}
2921
2922
2923/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
2924/// that point to V2 point to its first element.
2925static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
2926  MVT VT = SVOp->getValueType(0);
2927  unsigned NumElems = VT.getVectorNumElements();
2928
2929  bool Changed = false;
2930  SmallVector<int, 8> MaskVec;
2931  SVOp->getMask(MaskVec);
2932
2933  for (unsigned i = 0; i != NumElems; ++i) {
2934    if (MaskVec[i] > (int)NumElems) {
2935      MaskVec[i] = NumElems;
2936      Changed = true;
2937    }
2938  }
2939  if (Changed)
2940    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
2941                                SVOp->getOperand(1), &MaskVec[0]);
2942  return SDValue(SVOp, 0);
2943}
2944
2945/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
2946/// operation of specified width.
2947static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
2948                       SDValue V2) {
2949  unsigned NumElems = VT.getVectorNumElements();
2950  SmallVector<int, 8> Mask;
2951  Mask.push_back(NumElems);
2952  for (unsigned i = 1; i != NumElems; ++i)
2953    Mask.push_back(i);
2954  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
2955}
2956
2957/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
2958static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
2959                          SDValue V2) {
2960  unsigned NumElems = VT.getVectorNumElements();
2961  SmallVector<int, 8> Mask;
2962  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
2963    Mask.push_back(i);
2964    Mask.push_back(i + NumElems);
2965  }
2966  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
2967}
2968
2969/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
2970static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
2971                          SDValue V2) {
2972  unsigned NumElems = VT.getVectorNumElements();
2973  unsigned Half = NumElems/2;
2974  SmallVector<int, 8> Mask;
2975  for (unsigned i = 0; i != Half; ++i) {
2976    Mask.push_back(i + Half);
2977    Mask.push_back(i + NumElems + Half);
2978  }
2979  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
2980}
2981
2982/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4f32.
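/// For illustration: a v8i16 splat of element 5 is handled with one unpckh of
/// V1 with itself (after which 32-bit lane 1 holds two copies of element 5),
/// a <1, 1, 1, 1> v4f32 shuffle, and a bitcast back to v8i16.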
2983static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
2984                            bool HasSSE2) {
2985  if (SV->getValueType(0).getVectorNumElements() <= 4)
2986    return SDValue(SV, 0);
2987
2988  MVT PVT = MVT::v4f32;
2989  MVT VT = SV->getValueType(0);
2990  DebugLoc dl = SV->getDebugLoc();
2991  SDValue V1 = SV->getOperand(0);
2992  int NumElems = VT.getVectorNumElements();
2993  int EltNo = SV->getSplatIndex();
2994
2995  // unpack elements to the correct location
2996  while (NumElems > 4) {
2997    if (EltNo < NumElems/2) {
2998      V1 = getUnpackl(DAG, dl, VT, V1, V1);
2999    } else {
3000      V1 = getUnpackh(DAG, dl, VT, V1, V1);
3001      EltNo -= NumElems/2;
3002    }
3003    NumElems >>= 1;
3004  }
3005
3006  // Perform the splat.
3007  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
3008  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
3009  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
3010  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
3011}
3012
3013/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3014/// vector of zero or undef vector.  This produces a shuffle where the low
3015/// element of V2 is swizzled into the zero/undef vector, landing at element
3016/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
3017static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3018                                             bool isZero, bool HasSSE2,
3019                                             SelectionDAG &DAG) {
3020  MVT VT = V2.getValueType();
3021  SDValue V1 = isZero
3022    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
3023  unsigned NumElems = VT.getVectorNumElements();
3024  SmallVector<int, 16> MaskVec;
3025  for (unsigned i = 0; i != NumElems; ++i)
3026    // If this is the insertion idx, put the low elt of V2 here.
3027    MaskVec.push_back(i == Idx ? NumElems : i);
3028  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
3029}
3030
3031/// getNumOfConsecutiveZeros - Return the number of elements in a result of
3032/// a shuffle that is zero.
3033static
3034unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
3035                                  bool Low, SelectionDAG &DAG) {
3036  unsigned NumZeros = 0;
3037  for (int i = 0; i < NumElems; ++i) {
3038    unsigned Index = Low ? i : NumElems-i-1;
3039    int Idx = SVOp->getMaskElt(Index);
3040    if (Idx < 0) {
3041      ++NumZeros;
3042      continue;
3043    }
3044    SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
3045    if (Elt.getNode() && X86::isZeroNode(Elt))
3046      ++NumZeros;
3047    else
3048      break;
3049  }
3050  return NumZeros;
3051}
3052
3053/// isVectorShift - Returns true if the shuffle can be implemented as a
3054/// logical left or right shift of a vector.
3055/// FIXME: split into pslldqi, psrldqi, palignr variants.
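/// For illustration: with V2 an all-zeros vector (and V1 not known to be
/// zero), the v4i32 mask <4, 0, 1, 2> is recognized here with isLeft = true,
/// ShVal = V1 and ShAmt = 1, i.e. V1 shifted by one 32-bit element with zeros
/// shifted in.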
3056static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3057                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3058  int NumElems = SVOp->getValueType(0).getVectorNumElements();
3059
3060  isLeft = true;
3061  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
3062  if (!NumZeros) {
3063    isLeft = false;
3064    NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
3065    if (!NumZeros)
3066      return false;
3067  }
3068  bool SeenV1 = false;
3069  bool SeenV2 = false;
3070  for (int i = NumZeros; i < NumElems; ++i) {
3071    int Val = isLeft ? (i - NumZeros) : i;
3072    int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
3073    if (Idx < 0)
3074      continue;
3075    if (Idx < NumElems)
3076      SeenV1 = true;
3077    else {
3078      Idx -= NumElems;
3079      SeenV2 = true;
3080    }
3081    if (Idx != Val)
3082      return false;
3083  }
3084  if (SeenV1 && SeenV2)
3085    return false;
3086
3087  ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
3088  ShAmt = NumZeros;
3089  return true;
3090}
3091
3092
3093/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
3094///
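/// In outline: adjacent byte pairs are zero-extended to i16, combined as
/// (hi << 8) | lo, inserted into a v8i16, and the result is finally bitcast
/// back to v16i8.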
3095static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
3096                                       unsigned NumNonZero, unsigned NumZero,
3097                                       SelectionDAG &DAG, TargetLowering &TLI) {
3098  if (NumNonZero > 8)
3099    return SDValue();
3100
3101  DebugLoc dl = Op.getDebugLoc();
3102  SDValue V(0, 0);
3103  bool First = true;
3104  for (unsigned i = 0; i < 16; ++i) {
3105    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
3106    if (ThisIsNonZero && First) {
3107      if (NumZero)
3108        V = getZeroVector(MVT::v8i16, true, DAG, dl);
3109      else
3110        V = DAG.getUNDEF(MVT::v8i16);
3111      First = false;
3112    }
3113
3114    if ((i & 1) != 0) {
3115      SDValue ThisElt(0, 0), LastElt(0, 0);
3116      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
3117      if (LastIsNonZero) {
3118        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
3119                              MVT::i16, Op.getOperand(i-1));
3120      }
3121      if (ThisIsNonZero) {
3122        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
3123        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
3124                              ThisElt, DAG.getConstant(8, MVT::i8));
3125        if (LastIsNonZero)
3126          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
3127      } else
3128        ThisElt = LastElt;
3129
3130      if (ThisElt.getNode())
3131        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
3132                        DAG.getIntPtrConstant(i/2));
3133    }
3134  }
3135
3136  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
3137}
3138
3139/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
3140///
3141static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
3142                                       unsigned NumNonZero, unsigned NumZero,
3143                                       SelectionDAG &DAG, TargetLowering &TLI) {
3144  if (NumNonZero > 4)
3145    return SDValue();
3146
3147  DebugLoc dl = Op.getDebugLoc();
3148  SDValue V(0, 0);
3149  bool First = true;
3150  for (unsigned i = 0; i < 8; ++i) {
3151    bool isNonZero = (NonZeros & (1 << i)) != 0;
3152    if (isNonZero) {
3153      if (First) {
3154        if (NumZero)
3155          V = getZeroVector(MVT::v8i16, true, DAG, dl);
3156        else
3157          V = DAG.getUNDEF(MVT::v8i16);
3158        First = false;
3159      }
3160      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
3161                      MVT::v8i16, V, Op.getOperand(i),
3162                      DAG.getIntPtrConstant(i));
3163    }
3164  }
3165
3166  return V;
3167}
3168
3169/// getVShift - Return a vector logical shift node.
3170///
3171static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp,
3172                         unsigned NumBits, SelectionDAG &DAG,
3173                         const TargetLowering &TLI, DebugLoc dl) {
3174  bool isMMX = VT.getSizeInBits() == 64;
3175  MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
3176  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
3177  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
3178  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3179                     DAG.getNode(Opc, dl, ShVT, SrcOp,
3180                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
3181}
3182
3183SDValue
3184X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
3185  DebugLoc dl = Op.getDebugLoc();
3186  // All zeros are handled with pxor, all ones are handled with pcmpeqd.
3187  if (ISD::isBuildVectorAllZeros(Op.getNode())
3188      || ISD::isBuildVectorAllOnes(Op.getNode())) {
3189    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
3190    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
3191    // eliminated on x86-32 hosts.
3192    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
3193      return Op;
3194
3195    if (ISD::isBuildVectorAllOnes(Op.getNode()))
3196      return getOnesVector(Op.getValueType(), DAG, dl);
3197    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
3198  }
3199
3200  MVT VT = Op.getValueType();
3201  MVT EVT = VT.getVectorElementType();
3202  unsigned EVTBits = EVT.getSizeInBits();
3203
3204  unsigned NumElems = Op.getNumOperands();
3205  unsigned NumZero  = 0;
3206  unsigned NumNonZero = 0;
3207  unsigned NonZeros = 0;
3208  bool IsAllConstants = true;
3209  SmallSet<SDValue, 8> Values;
3210  for (unsigned i = 0; i < NumElems; ++i) {
3211    SDValue Elt = Op.getOperand(i);
3212    if (Elt.getOpcode() == ISD::UNDEF)
3213      continue;
3214    Values.insert(Elt);
3215    if (Elt.getOpcode() != ISD::Constant &&
3216        Elt.getOpcode() != ISD::ConstantFP)
3217      IsAllConstants = false;
3218    if (X86::isZeroNode(Elt))
3219      NumZero++;
3220    else {
3221      NonZeros |= (1 << i);
3222      NumNonZero++;
3223    }
3224  }
3225
3226  if (NumNonZero == 0) {
3227    // All undef vector. Return an UNDEF.  All zero vectors were handled above.
3228    return DAG.getUNDEF(VT);
3229  }
3230
3231  // Special case for single non-zero, non-undef, element.
3232  if (NumNonZero == 1) {
3233    unsigned Idx = CountTrailingZeros_32(NonZeros);
3234    SDValue Item = Op.getOperand(Idx);
3235
3236    // If this is an insertion of an i64 value on x86-32, and if the top bits of
3237    // the value are obviously zero, truncate the value to i32 and do the
3238    // insertion that way.  Only do this if the value is non-constant or if the
3239    // value is a constant being inserted into element 0.  It is cheaper to do
3240    // a constant pool load than it is to do a movd + shuffle.
3241    if (EVT == MVT::i64 && !Subtarget->is64Bit() &&
3242        (!IsAllConstants || Idx == 0)) {
3243      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
3244        // Handle MMX and SSE both.
3245        MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
3246        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
3247
3248        // Truncate the value (which may itself be a constant) to i32, and
3249        // convert it to a vector with movd (S2V+shuffle to zero extend).
3250        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
3251        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
3252        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3253                                           Subtarget->hasSSE2(), DAG);
3254
3255        // Now we have our 32-bit value zero extended in the low element of
3256        // a vector.  If Idx != 0, swizzle it into place.
3257        if (Idx != 0) {
3258          SmallVector<int, 4> Mask;
3259          Mask.push_back(Idx);
3260          for (unsigned i = 1; i != VecElts; ++i)
3261            Mask.push_back(i);
3262          Item = DAG.getVectorShuffle(VecVT, dl, Item,
3263                                      DAG.getUNDEF(Item.getValueType()),
3264                                      &Mask[0]);
3265        }
3266        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
3267      }
3268    }
3269
3270    // If we have a constant or non-constant insertion into the low element of
3271    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
3272    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
3273    // depending on what the source datatype is.
3274    if (Idx == 0) {
3275      if (NumZero == 0) {
3276        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3277      } else if (EVT == MVT::i32 || EVT == MVT::f32 || EVT == MVT::f64 ||
3278          (EVT == MVT::i64 && Subtarget->is64Bit())) {
3279        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3280        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
3281        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
3282                                           DAG);
3283      } else if (EVT == MVT::i16 || EVT == MVT::i8) {
3284        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
3285        MVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
3286        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
3287        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3288                                           Subtarget->hasSSE2(), DAG);
3289        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
3290      }
3291    }
3292
3293    // Is it a vector logical left shift?
3294    if (NumElems == 2 && Idx == 1 &&
3295        X86::isZeroNode(Op.getOperand(0)) &&
3296        !X86::isZeroNode(Op.getOperand(1))) {
3297      unsigned NumBits = VT.getSizeInBits();
3298      return getVShift(true, VT,
3299                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
3300                                   VT, Op.getOperand(1)),
3301                       NumBits/2, DAG, *this, dl);
3302    }
3303
3304    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
3305      return SDValue();
3306
3307    // Otherwise, if this is a vector with i32 or f32 elements, and the element
3308    // is a non-constant being inserted into an element other than the low one,
3309    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
3310    // movd/movss) to move this into the low element, then shuffle it into
3311    // place.
3312    if (EVTBits == 32) {
3313      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3314
3315      // Turn it into a shuffle of zero and zero-extended scalar to vector.
3316      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
3317                                         Subtarget->hasSSE2(), DAG);
3318      SmallVector<int, 8> MaskVec;
3319      for (unsigned i = 0; i < NumElems; i++)
3320        MaskVec.push_back(i == Idx ? 0 : 1);
3321      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
3322    }
3323  }
3324
3325  // Splat is obviously ok. Let legalizer expand it to a shuffle.
3326  if (Values.size() == 1)
3327    return SDValue();
3328
3329  // A vector full of immediates; various special cases are already
3330  // handled, so this is best done with a single constant-pool load.
3331  if (IsAllConstants)
3332    return SDValue();
3333
3334  // Let legalizer expand 2-wide build_vectors.
3335  if (EVTBits == 64) {
3336    if (NumNonZero == 1) {
3337      // One half is zero or undef.
3338      unsigned Idx = CountTrailingZeros_32(NonZeros);
3339      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
3340                                 Op.getOperand(Idx));
3341      return getShuffleVectorZeroOrUndef(V2, Idx, true,
3342                                         Subtarget->hasSSE2(), DAG);
3343    }
3344    return SDValue();
3345  }
3346
3347  // If element VT is < 32 bits, convert it to inserts into a zero vector.
3348  if (EVTBits == 8 && NumElems == 16) {
3349    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
3350                                        *this);
3351    if (V.getNode()) return V;
3352  }
3353
3354  if (EVTBits == 16 && NumElems == 8) {
3355    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
3356                                        *this);
3357    if (V.getNode()) return V;
3358  }
3359
3360  // If element VT is == 32 bits, turn it into a number of shuffles.
3361  SmallVector<SDValue, 8> V;
3362  V.resize(NumElems);
3363  if (NumElems == 4 && NumZero > 0) {
3364    for (unsigned i = 0; i < 4; ++i) {
3365      bool isZero = !(NonZeros & (1 << i));
3366      if (isZero)
3367        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
3368      else
3369        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3370    }
3371
3372    for (unsigned i = 0; i < 2; ++i) {
3373      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
3374        default: break;
3375        case 0:
3376          V[i] = V[i*2];  // Must be a zero vector.
3377          break;
3378        case 1:
3379          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
3380          break;
3381        case 2:
3382          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
3383          break;
3384        case 3:
3385          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
3386          break;
3387      }
3388    }
3389
3390    SmallVector<int, 8> MaskVec;
3391    bool Reverse = (NonZeros & 0x3) == 2;
3392    for (unsigned i = 0; i < 2; ++i)
3393      MaskVec.push_back(Reverse ? 1-i : i);
3394    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
3395    for (unsigned i = 0; i < 2; ++i)
3396      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
3397    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
3398  }
3399
3400  if (Values.size() > 2) {
3401    // If we have SSE 4.1, expand into a number of inserts unless the number of
3402    // values to be inserted is equal to the number of elements, in which case
3403    // use the unpack code below in the hopes of matching the consecutive elts
3404    // load merge pattern for shuffles.
3405    // FIXME: We could probably just check that here directly.
3406    if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
3407        getSubtarget()->hasSSE41()) {
3408      V[0] = DAG.getUNDEF(VT);
3409      for (unsigned i = 0; i < NumElems; ++i)
3410        if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
3411          V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
3412                             Op.getOperand(i), DAG.getIntPtrConstant(i));
3413      return V[0];
3414    }
3415    // Expand into a number of unpckl*.
3416    // e.g. for v4f32
3417    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
3418    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
3419    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
3420    for (unsigned i = 0; i < NumElems; ++i)
3421      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3422    NumElems >>= 1;
3423    while (NumElems != 0) {
3424      for (unsigned i = 0; i < NumElems; ++i)
3425        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
3426      NumElems >>= 1;
3427    }
3428    return V[0];
3429  }
3430
3431  return SDValue();
3432}
3433
3434// v8i16 shuffles - Prefer shuffles in the following order:
3435// 1. [all]   pshuflw, pshufhw, optional move
3436// 2. [ssse3] 1 x pshufb
3437// 3. [ssse3] 2 x pshufb + 1 x por
3438// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
3439static
3440SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
3441                                 SelectionDAG &DAG, X86TargetLowering &TLI) {
3442  SDValue V1 = SVOp->getOperand(0);
3443  SDValue V2 = SVOp->getOperand(1);
3444  DebugLoc dl = SVOp->getDebugLoc();
3445  SmallVector<int, 8> MaskVals;
3446
3447  // Determine if more than 1 of the words in each of the low and high quadwords
3448  // of the result come from the same quadword of one of the two inputs.  Undef
3449  // mask values count as coming from any quadword, for better codegen.
3450  SmallVector<unsigned, 4> LoQuad(4);
3451  SmallVector<unsigned, 4> HiQuad(4);
3452  BitVector InputQuads(4);
3453  for (unsigned i = 0; i < 8; ++i) {
3454    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
3455    int EltIdx = SVOp->getMaskElt(i);
3456    MaskVals.push_back(EltIdx);
3457    if (EltIdx < 0) {
3458      ++Quad[0];
3459      ++Quad[1];
3460      ++Quad[2];
3461      ++Quad[3];
3462      continue;
3463    }
3464    ++Quad[EltIdx / 4];
3465    InputQuads.set(EltIdx / 4);
3466  }
3467
3468  int BestLoQuad = -1;
3469  unsigned MaxQuad = 1;
3470  for (unsigned i = 0; i < 4; ++i) {
3471    if (LoQuad[i] > MaxQuad) {
3472      BestLoQuad = i;
3473      MaxQuad = LoQuad[i];
3474    }
3475  }
3476
3477  int BestHiQuad = -1;
3478  MaxQuad = 1;
3479  for (unsigned i = 0; i < 4; ++i) {
3480    if (HiQuad[i] > MaxQuad) {
3481      BestHiQuad = i;
3482      MaxQuad = HiQuad[i];
3483    }
3484  }
3485
3486  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
3487  // of the two input vectors, shuffle them into one input vector so only a
3488  // single pshufb instruction is necessary. If there are more than 2 input
3489  // quads, disable the next transformation since it does not help SSSE3.
3490  bool V1Used = InputQuads[0] || InputQuads[1];
3491  bool V2Used = InputQuads[2] || InputQuads[3];
3492  if (TLI.getSubtarget()->hasSSSE3()) {
3493    if (InputQuads.count() == 2 && V1Used && V2Used) {
3494      BestLoQuad = InputQuads.find_first();
3495      BestHiQuad = InputQuads.find_next(BestLoQuad);
3496    }
3497    if (InputQuads.count() > 2) {
3498      BestLoQuad = -1;
3499      BestHiQuad = -1;
3500    }
3501  }
3502
3503  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
3504  // the shuffle mask.  If a quad is scored as -1, that means that it contains
3505  // words from all 4 input quadwords.
3506  SDValue NewV;
3507  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
3508    SmallVector<int, 8> MaskV;
3509    MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
3510    MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
3511    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
3512                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
3513                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
3514    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
3515
3516    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
3517    // source words for the shuffle, to aid later transformations.
3518    bool AllWordsInNewV = true;
3519    bool InOrder[2] = { true, true };
3520    for (unsigned i = 0; i != 8; ++i) {
3521      int idx = MaskVals[i];
3522      if (idx != (int)i)
3523        InOrder[i/4] = false;
3524      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
3525        continue;
3526      AllWordsInNewV = false;
3527      break;
3528    }
3529
3530    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
3531    if (AllWordsInNewV) {
3532      for (int i = 0; i != 8; ++i) {
3533        int idx = MaskVals[i];
3534        if (idx < 0)
3535          continue;
3536        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
3537        if ((idx != i) && idx < 4)
3538          pshufhw = false;
3539        if ((idx != i) && idx > 3)
3540          pshuflw = false;
3541      }
3542      V1 = NewV;
3543      V2Used = false;
3544      BestLoQuad = 0;
3545      BestHiQuad = 1;
3546    }
3547
3548    // If we've eliminated the use of V2, and the new mask is a pshuflw or
3549    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
3550    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
3551      return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
3552                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
3553    }
3554  }
3555
3556  // If we have SSSE3, and all words of the result are from 1 input vector,
3557  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
3558  // is present, fall back to case 4.
3559  if (TLI.getSubtarget()->hasSSSE3()) {
3560    SmallVector<SDValue,16> pshufbMask;
3561
3562    // If we have elements from both input vectors, set the high bit of the
3563    // shuffle mask element to zero out elements that come from V2 in the V1
3564    // mask, and elements that come from V1 in the V2 mask, so that the two
3565    // results can be OR'd together.
3566    bool TwoInputs = V1Used && V2Used;
3567    for (unsigned i = 0; i != 8; ++i) {
3568      int EltIdx = MaskVals[i] * 2;
3569      if (TwoInputs && (EltIdx >= 16)) {
3570        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3571        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3572        continue;
3573      }
3574      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
3575      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
3576    }
3577    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
3578    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
3579                     DAG.getNode(ISD::BUILD_VECTOR, dl,
3580                                 MVT::v16i8, &pshufbMask[0], 16));
3581    if (!TwoInputs)
3582      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
3583
3584    // Calculate the shuffle mask for the second input, shuffle it, and
3585    // OR it with the first shuffled input.
3586    pshufbMask.clear();
3587    for (unsigned i = 0; i != 8; ++i) {
3588      int EltIdx = MaskVals[i] * 2;
3589      if (EltIdx < 16) {
3590        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3591        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3592        continue;
3593      }
3594      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
3595      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
3596    }
3597    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
3598    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
3599                     DAG.getNode(ISD::BUILD_VECTOR, dl,
3600                                 MVT::v16i8, &pshufbMask[0], 16));
3601    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
3602    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
3603  }
3604
3605  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
3606  // and update MaskVals with new element order.
3607  BitVector InOrder(8);
3608  if (BestLoQuad >= 0) {
3609    SmallVector<int, 8> MaskV;
3610    for (int i = 0; i != 4; ++i) {
3611      int idx = MaskVals[i];
3612      if (idx < 0) {
3613        MaskV.push_back(-1);
3614        InOrder.set(i);
3615      } else if ((idx / 4) == BestLoQuad) {
3616        MaskV.push_back(idx & 3);
3617        InOrder.set(i);
3618      } else {
3619        MaskV.push_back(-1);
3620      }
3621    }
3622    for (unsigned i = 4; i != 8; ++i)
3623      MaskV.push_back(i);
3624    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
3625                                &MaskV[0]);
3626  }
3627
3628  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
3629  // and update MaskVals with the new element order.
3630  if (BestHiQuad >= 0) {
3631    SmallVector<int, 8> MaskV;
3632    for (unsigned i = 0; i != 4; ++i)
3633      MaskV.push_back(i);
3634    for (unsigned i = 4; i != 8; ++i) {
3635      int idx = MaskVals[i];
3636      if (idx < 0) {
3637        MaskV.push_back(-1);
3638        InOrder.set(i);
3639      } else if ((idx / 4) == BestHiQuad) {
3640        MaskV.push_back((idx & 3) + 4);
3641        InOrder.set(i);
3642      } else {
3643        MaskV.push_back(-1);
3644      }
3645    }
3646    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
3647                                &MaskV[0]);
3648  }
3649
3650  // In case BestHiQuad & BestLoQuad were both -1, which means each quadword has
3651  // a word from each of the four input quadwords, calculate the InOrder bitvector now
3652  // before falling through to the insert/extract cleanup.
3653  if (BestLoQuad == -1 && BestHiQuad == -1) {
3654    NewV = V1;
3655    for (int i = 0; i != 8; ++i)
3656      if (MaskVals[i] < 0 || MaskVals[i] == i)
3657        InOrder.set(i);
3658  }
3659
3660  // The other elements are put in the right place using pextrw and pinsrw.
3661  for (unsigned i = 0; i != 8; ++i) {
3662    if (InOrder[i])
3663      continue;
3664    int EltIdx = MaskVals[i];
3665    if (EltIdx < 0)
3666      continue;
3667    SDValue ExtOp = (EltIdx < 8)
3668    ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
3669                  DAG.getIntPtrConstant(EltIdx))
3670    : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
3671                  DAG.getIntPtrConstant(EltIdx - 8));
3672    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
3673                       DAG.getIntPtrConstant(i));
3674  }
3675  return NewV;
3676}
3677
3678// v16i8 shuffles - Prefer shuffles in the following order:
3679// 1. [ssse3] 1 x pshufb
3680// 2. [ssse3] 2 x pshufb + 1 x por
3681// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
3682static
3683SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
3684                                 SelectionDAG &DAG, X86TargetLowering &TLI) {
3685  SDValue V1 = SVOp->getOperand(0);
3686  SDValue V2 = SVOp->getOperand(1);
3687  DebugLoc dl = SVOp->getDebugLoc();
3688  SmallVector<int, 16> MaskVals;
3689  SVOp->getMask(MaskVals);
3690
3691  // If we have SSSE3, case 1 is generated when all result bytes come from
3692  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
3693  // present, fall back to case 3.
3694  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
3695  bool V1Only = true;
3696  bool V2Only = true;
3697  for (unsigned i = 0; i < 16; ++i) {
3698    int EltIdx = MaskVals[i];
3699    if (EltIdx < 0)
3700      continue;
3701    if (EltIdx < 16)
3702      V2Only = false;
3703    else
3704      V1Only = false;
3705  }
3706
3707  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
3708  if (TLI.getSubtarget()->hasSSSE3()) {
3709    SmallVector<SDValue,16> pshufbMask;
3710
3711    // If all result elements are from one input vector, then only translate
3712    // undef mask values to 0x80 (zero out result) in the pshufb mask.
3713    //
3714    // Otherwise, we have elements from both input vectors, and must zero out
3715    // elements that come from V2 in the first mask, and V1 in the second mask
3716    // so that we can OR them together.
3717    bool TwoInputs = !(V1Only || V2Only);
3718    for (unsigned i = 0; i != 16; ++i) {
3719      int EltIdx = MaskVals[i];
3720      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
3721        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3722        continue;
3723      }
3724      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
3725    }
3726    // If all the elements are from V2, assign it to V1 and return after
3727    // building the first pshufb.
3728    if (V2Only)
3729      V1 = V2;
3730    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
3731                     DAG.getNode(ISD::BUILD_VECTOR, dl,
3732                                 MVT::v16i8, &pshufbMask[0], 16));
3733    if (!TwoInputs)
3734      return V1;
3735
3736    // Calculate the shuffle mask for the second input, shuffle it, and
3737    // OR it with the first shuffled input.
3738    pshufbMask.clear();
3739    for (unsigned i = 0; i != 16; ++i) {
3740      int EltIdx = MaskVals[i];
3741      if (EltIdx < 16) {
3742        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3743        continue;
3744      }
3745      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
3746    }
3747    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
3748                     DAG.getNode(ISD::BUILD_VECTOR, dl,
3749                                 MVT::v16i8, &pshufbMask[0], 16));
3750    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
3751  }
3752
3753  // No SSSE3 - Calculate in-place words and then fix all out-of-place words
3754  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
3755  // the 16 different words that comprise the two doublequadword input vectors.
3756  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
3757  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
3758  SDValue NewV = V2Only ? V2 : V1;
3759  for (int i = 0; i != 8; ++i) {
3760    int Elt0 = MaskVals[i*2];
3761    int Elt1 = MaskVals[i*2+1];
3762
3763    // This word of the result is all undef, skip it.
3764    if (Elt0 < 0 && Elt1 < 0)
3765      continue;
3766
3767    // This word of the result is already in the correct place, skip it.
3768    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
3769      continue;
3770    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
3771      continue;
3772
3773    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
3774    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
3775    SDValue InsElt;
3776
3777    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
3778    // together using a single extract, extract the word once and insert it.
3779    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
3780      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
3781                           DAG.getIntPtrConstant(Elt1 / 2));
3782      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
3783                        DAG.getIntPtrConstant(i));
3784      continue;
3785    }
3786
3787    // If Elt1 is defined, extract it from the appropriate source.  If the
3788    // source byte is not also odd, shift the extracted word left 8 bits;
3789    // otherwise clear the bottom 8 bits if we need to do an or.
3790    if (Elt1 >= 0) {
3791      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
3792                           DAG.getIntPtrConstant(Elt1 / 2));
3793      if ((Elt1 & 1) == 0)
3794        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
3795                             DAG.getConstant(8, TLI.getShiftAmountTy()));
3796      else if (Elt0 >= 0)
3797        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
3798                             DAG.getConstant(0xFF00, MVT::i16));
3799    }
3800    // If Elt0 is defined, extract it from the appropriate source.  If the
3801    // source byte is not also even, shift the extracted word right 8 bits. If
3802    // Elt1 was also defined, OR the extracted values together before
3803    // inserting them in the result.
3804    if (Elt0 >= 0) {
3805      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
3806                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
3807      if ((Elt0 & 1) != 0)
3808        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
3809                              DAG.getConstant(8, TLI.getShiftAmountTy()));
3810      else if (Elt1 >= 0)
3811        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
3812                             DAG.getConstant(0x00FF, MVT::i16));
3813      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
3814                         : InsElt0;
3815    }
3816    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
3817                       DAG.getIntPtrConstant(i));
3818  }
3819  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
3820}
3821
3822/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
3823/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
3824/// done when every pair / quad of shuffle mask elements point to elements in
3825/// the right sequence. e.g.
3826/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
3827static
3828SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
3829                                 SelectionDAG &DAG,
3830                                 TargetLowering &TLI, DebugLoc dl) {
3831  MVT VT = SVOp->getValueType(0);
3832  SDValue V1 = SVOp->getOperand(0);
3833  SDValue V2 = SVOp->getOperand(1);
3834  unsigned NumElems = VT.getVectorNumElements();
3835  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
3836  MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
3837  MVT MaskEltVT = MaskVT.getVectorElementType();
3838  MVT NewVT = MaskVT;
3839  switch (VT.getSimpleVT()) {
3840  default: assert(false && "Unexpected!");
3841  case MVT::v4f32: NewVT = MVT::v2f64; break;
3842  case MVT::v4i32: NewVT = MVT::v2i64; break;
3843  case MVT::v8i16: NewVT = MVT::v4i32; break;
3844  case MVT::v16i8: NewVT = MVT::v4i32; break;
3845  }
3846
3847  if (NewWidth == 2) {
3848    if (VT.isInteger())
3849      NewVT = MVT::v2i64;
3850    else
3851      NewVT = MVT::v2f64;
3852  }
3853  int Scale = NumElems / NewWidth;
3854  SmallVector<int, 8> MaskVec;
3855  for (unsigned i = 0; i < NumElems; i += Scale) {
3856    int StartIdx = -1;
3857    for (int j = 0; j < Scale; ++j) {
3858      int EltIdx = SVOp->getMaskElt(i+j);
3859      if (EltIdx < 0)
3860        continue;
3861      if (StartIdx == -1)
3862        StartIdx = EltIdx - (EltIdx % Scale);
3863      if (EltIdx != StartIdx + j)
3864        return SDValue();
3865    }
3866    if (StartIdx == -1)
3867      MaskVec.push_back(-1);
3868    else
3869      MaskVec.push_back(StartIdx / Scale);
3870  }
3871
3872  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
3873  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
3874  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
3875}
3876
3877/// getVZextMovL - Return a zero-extending vector move low node.
3878///
3879static SDValue getVZextMovL(MVT VT, MVT OpVT,
3880                            SDValue SrcOp, SelectionDAG &DAG,
3881                            const X86Subtarget *Subtarget, DebugLoc dl) {
3882  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
3883    LoadSDNode *LD = NULL;
3884    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
3885      LD = dyn_cast<LoadSDNode>(SrcOp);
3886    if (!LD) {
3887      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
3888      // instead.
3889      MVT EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
3890      if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
3891          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
3892          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
3893          SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
3894        // PR2108
3895        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
3896        return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3897                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
3898                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
3899                                                   OpVT,
3900                                                   SrcOp.getOperand(0)
3901                                                          .getOperand(0))));
3902      }
3903    }
3904  }
3905
3906  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3907                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
3908                                 DAG.getNode(ISD::BIT_CONVERT, dl,
3909                                             OpVT, SrcOp)));
3910}
3911
3912/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
3913/// shuffles.
3914static SDValue
3915LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
3916  SDValue V1 = SVOp->getOperand(0);
3917  SDValue V2 = SVOp->getOperand(1);
3918  DebugLoc dl = SVOp->getDebugLoc();
3919  MVT VT = SVOp->getValueType(0);
3920
3921  SmallVector<std::pair<int, int>, 8> Locs;
3922  Locs.resize(4);
3923  SmallVector<int, 8> Mask1(4U, -1);
3924  SmallVector<int, 8> PermMask;
3925  SVOp->getMask(PermMask);
3926
3927  unsigned NumHi = 0;
3928  unsigned NumLo = 0;
3929  for (unsigned i = 0; i != 4; ++i) {
3930    int Idx = PermMask[i];
3931    if (Idx < 0) {
3932      Locs[i] = std::make_pair(-1, -1);
3933    } else {
3934      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
3935      if (Idx < 4) {
3936        Locs[i] = std::make_pair(0, NumLo);
3937        Mask1[NumLo] = Idx;
3938        NumLo++;
3939      } else {
3940        Locs[i] = std::make_pair(1, NumHi);
3941        if (2+NumHi < 4)
3942          Mask1[2+NumHi] = Idx;
3943        NumHi++;
3944      }
3945    }
3946  }
3947
3948  if (NumLo <= 2 && NumHi <= 2) {
3949    // No more than two elements come from either vector, so this can be
3950    // implemented with two shuffles. The first shuffle gathers the elements.
3951    // The second shuffle, which takes the first shuffle as both of its
3952    // vector operands, puts the elements into the right order.
3953    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
3954
3955    SmallVector<int, 8> Mask2(4U, -1);
3956
3957    for (unsigned i = 0; i != 4; ++i) {
3958      if (Locs[i].first == -1)
3959        continue;
3960      else {
3961        unsigned Idx = (i < 2) ? 0 : 4;
3962        Idx += Locs[i].first * 2 + Locs[i].second;
3963        Mask2[i] = Idx;
3964      }
3965    }
3966
3967    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
3968  } else if (NumLo == 3 || NumHi == 3) {
3969    // Otherwise, we must have three elements from one vector, call it X, and
3970    // one element from the other, call it Y.  First, use a shufps to build an
3971    // intermediate vector with the one element from Y and the element from X
3972    // that will be in the same half in the final destination (the indexes don't
3973    // matter). Then, use a shufps to build the final vector, taking the half
3974    // containing the element from Y from the intermediate, and the other half
3975    // from X.
3976    if (NumHi == 3) {
3977      // Normalize it so the 3 elements come from V1.
3978      CommuteVectorShuffleMask(PermMask, VT);
3979      std::swap(V1, V2);
3980    }
3981
3982    // Find the element from V2.
3983    unsigned HiIndex;
3984    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
3985      int Val = PermMask[HiIndex];
3986      if (Val < 0)
3987        continue;
3988      if (Val >= 4)
3989        break;
3990    }
3991
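    // PermMask[HiIndex] is the single element that comes from V2;
    // PermMask[HiIndex^1] is the other element of the same pair, which (when
    // defined) comes from V1. Gather both into one intermediate vector so the
    // final shuffle below can combine it with V1.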
3992    Mask1[0] = PermMask[HiIndex];
3993    Mask1[1] = -1;
3994    Mask1[2] = PermMask[HiIndex^1];
3995    Mask1[3] = -1;
3996    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
3997
3998    if (HiIndex >= 2) {
3999      Mask1[0] = PermMask[0];
4000      Mask1[1] = PermMask[1];
4001      Mask1[2] = HiIndex & 1 ? 6 : 4;
4002      Mask1[3] = HiIndex & 1 ? 4 : 6;
4003      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4004    } else {
4005      Mask1[0] = HiIndex & 1 ? 2 : 0;
4006      Mask1[1] = HiIndex & 1 ? 0 : 2;
4007      Mask1[2] = PermMask[2];
4008      Mask1[3] = PermMask[3];
4009      if (Mask1[2] >= 0)
4010        Mask1[2] += 4;
4011      if (Mask1[3] >= 0)
4012        Mask1[3] += 4;
4013      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
4014    }
4015  }
4016
4017  // Break it into (shuffle shuffle_hi, shuffle_lo).
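  // LoMask gathers the sources of result elements 0-1 into one intermediate
  // and HiMask the sources of elements 2-3 into another; within each mask,
  // V1 elements land in the low half and V2 elements in the high half. Locs
  // records where each source ended up so the final shuffle of the two
  // intermediates can place it correctly.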
4018  Locs.clear();
  Locs.resize(4);  // clear() left Locs empty; regrow it before indexing below.
4019  SmallVector<int,8> LoMask(4U, -1);
4020  SmallVector<int,8> HiMask(4U, -1);
4021
4022  SmallVector<int,8> *MaskPtr = &LoMask;
4023  unsigned MaskIdx = 0;
4024  unsigned LoIdx = 0;
4025  unsigned HiIdx = 2;
4026  for (unsigned i = 0; i != 4; ++i) {
4027    if (i == 2) {
4028      MaskPtr = &HiMask;
4029      MaskIdx = 1;
4030      LoIdx = 0;
4031      HiIdx = 2;
4032    }
4033    int Idx = PermMask[i];
4034    if (Idx < 0) {
4035      Locs[i] = std::make_pair(-1, -1);
4036    } else if (Idx < 4) {
4037      Locs[i] = std::make_pair(MaskIdx, LoIdx);
4038      (*MaskPtr)[LoIdx] = Idx;
4039      LoIdx++;
4040    } else {
4041      Locs[i] = std::make_pair(MaskIdx, HiIdx);
4042      (*MaskPtr)[HiIdx] = Idx;
4043      HiIdx++;
4044    }
4045  }
4046
4047  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
4048  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
4049  SmallVector<int, 8> MaskOps;
4050  for (unsigned i = 0; i != 4; ++i) {
4051    if (Locs[i].first == -1) {
4052      MaskOps.push_back(-1);
4053    } else {
4054      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
4055      MaskOps.push_back(Idx);
4056    }
4057  }
4058  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
4059}
4060
4061SDValue
4062X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
4063  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
4064  SDValue V1 = Op.getOperand(0);
4065  SDValue V2 = Op.getOperand(1);
4066  MVT VT = Op.getValueType();
4067  DebugLoc dl = Op.getDebugLoc();
4068  unsigned NumElems = VT.getVectorNumElements();
4069  bool isMMX = VT.getSizeInBits() == 64;
4070  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
4071  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
4072  bool V1IsSplat = false;
4073  bool V2IsSplat = false;
4074
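  // Try progressively cheaper lowerings first: a zero vector, a promoted
  // splat, a narrower shuffle, a single fixed-pattern shuffle instruction
  // (pshufd, movl, unpck*, movhlps, ...), and only then the generic
  // multi-shuffle expansions at the end of this function.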
4075  if (isZeroShuffle(SVOp))
4076    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
4077
4078  // Promote splats to v4f32.
4079  if (SVOp->isSplat()) {
4080    if (isMMX || NumElems < 4)
4081      return Op;
4082    return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
4083  }
4084
4085  // If the shuffle can be profitably rewritten as a narrower shuffle, then
4086  // do it!
4087  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
4088    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4089    if (NewOp.getNode())
4090      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4091                         LowerVECTOR_SHUFFLE(NewOp, DAG));
4092  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
4093    // FIXME: Figure out a cleaner way to do this.
4094    // Try to make use of movq to zero out the top part.
4095    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
4096      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4097      if (NewOp.getNode()) {
4098        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
4099          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
4100                              DAG, Subtarget, dl);
4101      }
4102    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
4103      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4104      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
4105        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
4106                            DAG, Subtarget, dl);
4107    }
4108  }
4109
4110  if (X86::isPSHUFDMask(SVOp))
4111    return Op;
4112
4113  // Check if this can be converted into a logical shift.
4114  bool isLeft = false;
4115  unsigned ShAmt = 0;
4116  SDValue ShVal;
4117  bool isShift = getSubtarget()->hasSSE2() &&
4118  isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
4119  if (isShift && ShVal.hasOneUse()) {
4120    // If the shifted value has multiple uses, it may be cheaper to use
4121    // v_set0 + movlhps or movhlps, etc.
4122    MVT EVT = VT.getVectorElementType();
4123    ShAmt *= EVT.getSizeInBits();
4124    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4125  }
4126
4127  if (X86::isMOVLMask(SVOp)) {
4128    if (V1IsUndef)
4129      return V2;
4130    if (ISD::isBuildVectorAllZeros(V1.getNode()))
4131      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
4132    if (!isMMX)
4133      return Op;
4134  }
4135
4136  // FIXME: fold these into legal mask.
4137  if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
4138                 X86::isMOVSLDUPMask(SVOp) ||
4139                 X86::isMOVHLPSMask(SVOp) ||
4140                 X86::isMOVHPMask(SVOp) ||
4141                 X86::isMOVLPMask(SVOp)))
4142    return Op;
4143
4144  if (ShouldXformToMOVHLPS(SVOp) ||
4145      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
4146    return CommuteVectorShuffle(SVOp, DAG);
4147
4148  if (isShift) {
4149    // No better options. Use a vshl / vsrl.
4150    MVT EVT = VT.getVectorElementType();
4151    ShAmt *= EVT.getSizeInBits();
4152    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4153  }
4154
4155  bool Commuted = false;
4156  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
4157  // 1,1,1,1 -> v8i16 though.
4158  V1IsSplat = isSplatVector(V1.getNode());
4159  V2IsSplat = isSplatVector(V2.getNode());
4160
4161  // Canonicalize the splat or undef, if present, to be on the RHS.
4162  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
4163    Op = CommuteVectorShuffle(SVOp, DAG);
4164    SVOp = cast<ShuffleVectorSDNode>(Op);
4165    V1 = SVOp->getOperand(0);
4166    V2 = SVOp->getOperand(1);
4167    std::swap(V1IsSplat, V2IsSplat);
4168    std::swap(V1IsUndef, V2IsUndef);
4169    Commuted = true;
4170  }
4171
4172  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
4173    // Shuffling low element of v1 into undef, just return v1.
4174    if (V2IsUndef)
4175      return V1;
4176    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
4177    // the instruction selector will not match, so get a canonical MOVL with
4178    // swapped operands to undo the commute.
4179    return getMOVL(DAG, dl, VT, V2, V1);
4180  }
4181
4182  if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
4183      X86::isUNPCKH_v_undef_Mask(SVOp) ||
4184      X86::isUNPCKLMask(SVOp) ||
4185      X86::isUNPCKHMask(SVOp))
4186    return Op;
4187
4188  if (V2IsSplat) {
4189    // Normalize the mask so all entries that point to V2 point to its first
4190    // element, then try to match unpck{h|l} again. If it matches, return a
4191    // new vector_shuffle with the corrected mask.
4192    SDValue NewMask = NormalizeMask(SVOp, DAG);
4193    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
4194    if (NSVOp != SVOp) {
4195      if (X86::isUNPCKLMask(NSVOp, true)) {
4196        return NewMask;
4197      } else if (X86::isUNPCKHMask(NSVOp, true)) {
4198        return NewMask;
4199      }
4200    }
4201  }
4202
4203  if (Commuted) {
4204    // Commute it back and try unpck* again.
4205    // FIXME: this seems wrong.
4206    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
4207    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
4208    if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
4209        X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
4210        X86::isUNPCKLMask(NewSVOp) ||
4211        X86::isUNPCKHMask(NewSVOp))
4212      return NewOp;
4213  }
4214
4215  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
4216
4217  // Normalize the node to match x86 shuffle ops if needed
4218  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
4219    return CommuteVectorShuffle(SVOp, DAG);
4220
4221  // If the mask is already legal for a target shuffle instruction, return it.
4222  SmallVector<int, 16> PermMask;
4223  SVOp->getMask(PermMask);
4224  if (isShuffleMaskLegal(PermMask, VT))
4225    return Op;
4226
4227  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
4228  if (VT == MVT::v8i16) {
4229    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
4230    if (NewOp.getNode())
4231      return NewOp;
4232  }
4233
4234  if (VT == MVT::v16i8) {
4235    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
4236    if (NewOp.getNode())
4237      return NewOp;
4238  }
4239
4240  // Handle all 4 wide cases with a number of shuffles except for MMX.
4241  if (NumElems == 4 && !isMMX)
4242    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
4243
4244  return SDValue();
4245}
4246
4247SDValue
4248X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
4249                                                SelectionDAG &DAG) {
4250  MVT VT = Op.getValueType();
4251  DebugLoc dl = Op.getDebugLoc();
4252  if (VT.getSizeInBits() == 8) {
4253    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
4254                                    Op.getOperand(0), Op.getOperand(1));
4255    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4256                                    DAG.getValueType(VT));
4257    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4258  } else if (VT.getSizeInBits() == 16) {
4259    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4260    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
4261    if (Idx == 0)
4262      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4263                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4264                                     DAG.getNode(ISD::BIT_CONVERT, dl,
4265                                                 MVT::v4i32,
4266                                                 Op.getOperand(0)),
4267                                     Op.getOperand(1)));
4268    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
4269                                    Op.getOperand(0), Op.getOperand(1));
4270    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4271                                    DAG.getValueType(VT));
4272    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4273  } else if (VT == MVT::f32) {
4274    // EXTRACTPS outputs to a GR32 register, which will require a movd to copy
4275    // the result back to an FR32 register. It's only worth matching if the
4276    // result has a single use which is a store or a bitcast to i32.  And in
4277    // the case of a store, it's not worth it if the index is a constant 0,
4278    // because a MOVSSmr can be used instead, which is smaller and faster.
4279    if (!Op.hasOneUse())
4280      return SDValue();
4281    SDNode *User = *Op.getNode()->use_begin();
4282    if ((User->getOpcode() != ISD::STORE ||
4283         (isa<ConstantSDNode>(Op.getOperand(1)) &&
4284          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
4285        (User->getOpcode() != ISD::BIT_CONVERT ||
4286         User->getValueType(0) != MVT::i32))
4287      return SDValue();
4288    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4289                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
4290                                              Op.getOperand(0)),
4291                                              Op.getOperand(1));
4292    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
4293  } else if (VT == MVT::i32) {
4294    // ExtractPS works with constant index.
4295    if (isa<ConstantSDNode>(Op.getOperand(1)))
4296      return Op;
4297  }
4298  return SDValue();
4299}
4300
4301
4302SDValue
4303X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4304  if (!isa<ConstantSDNode>(Op.getOperand(1)))
4305    return SDValue();
4306
4307  if (Subtarget->hasSSE41()) {
4308    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
4309    if (Res.getNode())
4310      return Res;
4311  }
4312
4313  MVT VT = Op.getValueType();
4314  DebugLoc dl = Op.getDebugLoc();
4315  // TODO: handle v16i8.
4316  if (VT.getSizeInBits() == 16) {
4317    SDValue Vec = Op.getOperand(0);
4318    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4319    if (Idx == 0)
4320      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4321                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4322                                     DAG.getNode(ISD::BIT_CONVERT, dl,
4323                                                 MVT::v4i32, Vec),
4324                                     Op.getOperand(1)));
4325    // Transform it so it matches pextrw, which produces a 32-bit result.
4326    MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1);
4327    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT,
4328                                    Op.getOperand(0), Op.getOperand(1));
4329    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EVT, Extract,
4330                                    DAG.getValueType(VT));
4331    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4332  } else if (VT.getSizeInBits() == 32) {
4333    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4334    if (Idx == 0)
4335      return Op;
4336
4337    // SHUFPS the element to the lowest double word, then movss.
4338    int Mask[4] = { Idx, -1, -1, -1 };
4339    MVT VVT = Op.getOperand(0).getValueType();
4340    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4341                                       DAG.getUNDEF(VVT), Mask);
4342    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4343                       DAG.getIntPtrConstant(0));
4344  } else if (VT.getSizeInBits() == 64) {
4345    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
4346    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
4347    //        to match extract_elt for f64.
4348    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4349    if (Idx == 0)
4350      return Op;
4351
4352    // UNPCKHPD the element to the lowest double word, then movsd.
4353    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
4354    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
4355    int Mask[2] = { 1, -1 };
4356    MVT VVT = Op.getOperand(0).getValueType();
4357    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4358                                       DAG.getUNDEF(VVT), Mask);
4359    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4360                       DAG.getIntPtrConstant(0));
4361  }
4362
4363  return SDValue();
4364}
4365
4366SDValue
4367X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
4368  MVT VT = Op.getValueType();
4369  MVT EVT = VT.getVectorElementType();
4370  DebugLoc dl = Op.getDebugLoc();
4371
4372  SDValue N0 = Op.getOperand(0);
4373  SDValue N1 = Op.getOperand(1);
4374  SDValue N2 = Op.getOperand(2);
4375
4376  if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) &&
4377      isa<ConstantSDNode>(N2)) {
4378    unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB
4379                                              : X86ISD::PINSRW;
4380    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
4381    // argument.
4382    if (N1.getValueType() != MVT::i32)
4383      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4384    if (N2.getValueType() != MVT::i32)
4385      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4386    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
4387  } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
4388    // Bits [7:6] of the constant are the source select.  This will always be
4389    //  zero here.  The DAG Combiner may combine an extract_elt index into these
4390    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
4391    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
4392    // Bits [5:4] of the constant are the destination select.  This is the
4393    //  value of the incoming immediate.
4394    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
4395    //   combine either bitwise AND or insert of float 0.0 to set these bits.
4396    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
4397    // Create this as a scalar to vector.
4398    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
4399    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
4400  } else if (EVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
4401    // PINSR* works with constant index.
4402    return Op;
4403  }
4404  return SDValue();
4405}
4406
4407SDValue
4408X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4409  MVT VT = Op.getValueType();
4410  MVT EVT = VT.getVectorElementType();
4411
4412  if (Subtarget->hasSSE41())
4413    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
4414
4415  if (EVT == MVT::i8)
4416    return SDValue();
4417
4418  DebugLoc dl = Op.getDebugLoc();
4419  SDValue N0 = Op.getOperand(0);
4420  SDValue N1 = Op.getOperand(1);
4421  SDValue N2 = Op.getOperand(2);
4422
4423  if (EVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
4424    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
4425    // as its second argument.
4426    if (N1.getValueType() != MVT::i32)
4427      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4428    if (N2.getValueType() != MVT::i32)
4429      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4430    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
4431  }
4432  return SDValue();
4433}
4434
4435SDValue
4436X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
4437  DebugLoc dl = Op.getDebugLoc();
4438  if (Op.getValueType() == MVT::v2f32)
4439    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
4440                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
4441                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
4442                                               Op.getOperand(0))));
4443
4444  if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
4445    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
4446
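  // Otherwise any-extend the scalar to i32, build a v2i32 or v4i32
  // SCALAR_TO_VECTOR of it, and bitcast the result to the requested vector
  // type.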
4447  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
4448  MVT VT = MVT::v2i32;
4449  switch (Op.getValueType().getSimpleVT()) {
4450  default: break;
4451  case MVT::v16i8:
4452  case MVT::v8i16:
4453    VT = MVT::v4i32;
4454    break;
4455  }
4456  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
4457                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
4458}
4459
4460// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
4461// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
4462// one of the above mentioned nodes. It has to be wrapped because otherwise
4463// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
4464// be used to form addressing modes. These wrapped nodes will be selected
4465// into MOV32ri.
4466SDValue
4467X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
4468  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
4469
4470  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4471  // global base reg.
4472  unsigned char OpFlag = 0;
4473  unsigned WrapperKind = X86ISD::Wrapper;
4474  CodeModel::Model M = getTargetMachine().getCodeModel();
4475
4476  if (Subtarget->isPICStyleRIPRel() &&
4477      (M == CodeModel::Small || M == CodeModel::Kernel))
4478    WrapperKind = X86ISD::WrapperRIP;
4479  else if (Subtarget->isPICStyleGOT())
4480    OpFlag = X86II::MO_GOTOFF;
4481  else if (Subtarget->isPICStyleStubPIC())
4482    OpFlag = X86II::MO_PIC_BASE_OFFSET;
4483
4484  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
4485                                             CP->getAlignment(),
4486                                             CP->getOffset(), OpFlag);
4487  DebugLoc DL = CP->getDebugLoc();
4488  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4489  // With PIC, the address is actually $g + Offset.
4490  if (OpFlag) {
4491    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4492                         DAG.getNode(X86ISD::GlobalBaseReg,
4493                                     DebugLoc::getUnknownLoc(), getPointerTy()),
4494                         Result);
4495  }
4496
4497  return Result;
4498}
4499
4500SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
4501  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
4502
4503  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4504  // global base reg.
4505  unsigned char OpFlag = 0;
4506  unsigned WrapperKind = X86ISD::Wrapper;
4507  CodeModel::Model M = getTargetMachine().getCodeModel();
4508
4509  if (Subtarget->isPICStyleRIPRel() &&
4510      (M == CodeModel::Small || M == CodeModel::Kernel))
4511    WrapperKind = X86ISD::WrapperRIP;
4512  else if (Subtarget->isPICStyleGOT())
4513    OpFlag = X86II::MO_GOTOFF;
4514  else if (Subtarget->isPICStyleStubPIC())
4515    OpFlag = X86II::MO_PIC_BASE_OFFSET;
4516
4517  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
4518                                          OpFlag);
4519  DebugLoc DL = JT->getDebugLoc();
4520  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4521
4522  // With PIC, the address is actually $g + Offset.
4523  if (OpFlag) {
4524    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4525                         DAG.getNode(X86ISD::GlobalBaseReg,
4526                                     DebugLoc::getUnknownLoc(), getPointerTy()),
4527                         Result);
4528  }
4529
4530  return Result;
4531}
4532
4533SDValue
4534X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
4535  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
4536
4537  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4538  // global base reg.
4539  unsigned char OpFlag = 0;
4540  unsigned WrapperKind = X86ISD::Wrapper;
4541  CodeModel::Model M = getTargetMachine().getCodeModel();
4542
4543  if (Subtarget->isPICStyleRIPRel() &&
4544      (M == CodeModel::Small || M == CodeModel::Kernel))
4545    WrapperKind = X86ISD::WrapperRIP;
4546  else if (Subtarget->isPICStyleGOT())
4547    OpFlag = X86II::MO_GOTOFF;
4548  else if (Subtarget->isPICStyleStubPIC())
4549    OpFlag = X86II::MO_PIC_BASE_OFFSET;
4550
4551  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
4552
4553  DebugLoc DL = Op.getDebugLoc();
4554  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4555
4556
4557  // With PIC, the address is actually $g + Offset.
4558  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
4559      !Subtarget->is64Bit()) {
4560    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4561                         DAG.getNode(X86ISD::GlobalBaseReg,
4562                                     DebugLoc::getUnknownLoc(),
4563                                     getPointerTy()),
4564                         Result);
4565  }
4566
4567  return Result;
4568}
4569
4570SDValue
4571X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
4572                                      int64_t Offset,
4573                                      SelectionDAG &DAG) const {
4574  // Create the TargetGlobalAddress node, folding in the constant
4575  // offset if it is legal.
4576  unsigned char OpFlags =
4577    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
4578  CodeModel::Model M = getTargetMachine().getCodeModel();
4579  SDValue Result;
4580  if (OpFlags == X86II::MO_NO_FLAG &&
4581      X86::isOffsetSuitableForCodeModel(Offset, M)) {
4582    // A direct static reference to a global.
4583    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
4584    Offset = 0;
4585  } else {
4586    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
4587  }
4588
4589  if (Subtarget->isPICStyleRIPRel() &&
4590      (M == CodeModel::Small || M == CodeModel::Kernel))
4591    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
4592  else
4593    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
4594
4595  // With PIC, the address is actually $g + Offset.
4596  if (isGlobalRelativeToPICBase(OpFlags)) {
4597    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
4598                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
4599                         Result);
4600  }
4601
4602  // For globals that require a load from a stub to get the address, emit the
4603  // load.
4604  if (isGlobalStubReference(OpFlags))
4605    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
4606                         PseudoSourceValue::getGOT(), 0);
4607
4608  // If there was a non-zero offset that we didn't fold, create an explicit
4609  // addition for it.
4610  if (Offset != 0)
4611    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
4612                         DAG.getConstant(Offset, getPointerTy()));
4613
4614  return Result;
4615}
4616
4617SDValue
4618X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
4619  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
4620  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
4621  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
4622}
4623
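// GetTLSADDR - Build an X86ISD::TLSADDR node for GA and return the computed
// TLS address copied out of ReturnReg (callers pass EAX for the 32-bit and
// RAX for the 64-bit general-dynamic lowering).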
4624static SDValue
4625GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
4626           SDValue *InFlag, const MVT PtrVT, unsigned ReturnReg,
4627           unsigned char OperandFlags) {
4628  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
4629  DebugLoc dl = GA->getDebugLoc();
4630  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
4631                                           GA->getValueType(0),
4632                                           GA->getOffset(),
4633                                           OperandFlags);
4634  if (InFlag) {
4635    SDValue Ops[] = { Chain,  TGA, *InFlag };
4636    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
4637  } else {
4638    SDValue Ops[]  = { Chain, TGA };
4639    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
4640  }
4641  SDValue Flag = Chain.getValue(1);
4642  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
4643}
4644
4645// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
4646static SDValue
4647LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4648                                const MVT PtrVT) {
4649  SDValue InFlag;
4650  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
4651  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
4652                                     DAG.getNode(X86ISD::GlobalBaseReg,
4653                                                 DebugLoc::getUnknownLoc(),
4654                                                 PtrVT), InFlag);
4655  InFlag = Chain.getValue(1);
4656
4657  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
4658}
4659
4660// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
4661static SDValue
4662LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4663                                const MVT PtrVT) {
4664  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
4665                    X86::RAX, X86II::MO_TLSGD);
4666}
4667
4668// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
4669// "local exec" model.
4670static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4671                                   const MVT PtrVT, TLSModel::Model model,
4672                                   bool is64Bit) {
4673  DebugLoc dl = GA->getDebugLoc();
4674  // Get the Thread Pointer
4675  SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
4676                             DebugLoc::getUnknownLoc(), PtrVT,
4677                             DAG.getRegister(is64Bit? X86::FS : X86::GS,
4678                                             MVT::i32));
4679
4680  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
4681                                      NULL, 0);
4682
4683  unsigned char OperandFlags = 0;
4684  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
4685  // initial-exec.
4686  unsigned WrapperKind = X86ISD::Wrapper;
4687  if (model == TLSModel::LocalExec) {
4688    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
4689  } else if (is64Bit) {
4690    assert(model == TLSModel::InitialExec);
4691    OperandFlags = X86II::MO_GOTTPOFF;
4692    WrapperKind = X86ISD::WrapperRIP;
4693  } else {
4694    assert(model == TLSModel::InitialExec);
4695    OperandFlags = X86II::MO_INDNTPOFF;
4696  }
4697
4698  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
4699  // exec)
4700  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
4701                                           GA->getOffset(), OperandFlags);
4702  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
4703
4704  if (model == TLSModel::InitialExec)
4705    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
4706                         PseudoSourceValue::getGOT(), 0);
4707
4708  // The address of the thread local variable is the add of the thread
4709  // pointer with the offset of the variable.
4710  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
4711}
4712
4713SDValue
4714X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
4715  // TODO: implement the "local dynamic" model
4716  // TODO: implement the "initial exec" model for PIC executables
4717  assert(Subtarget->isTargetELF() &&
4718         "TLS not implemented for non-ELF targets");
4719  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
4720  const GlobalValue *GV = GA->getGlobal();
4721
4722  // If GV is an alias then use the aliasee for determining
4723  // thread-localness.
4724  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
4725    GV = GA->resolveAliasedGlobal(false);
4726
4727  TLSModel::Model model = getTLSModel(GV,
4728                                      getTargetMachine().getRelocationModel());
4729
4730  switch (model) {
4731  case TLSModel::GeneralDynamic:
4732  case TLSModel::LocalDynamic: // not implemented
4733    if (Subtarget->is64Bit())
4734      return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
4735    return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
4736
4737  case TLSModel::InitialExec:
4738  case TLSModel::LocalExec:
4739    return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
4740                               Subtarget->is64Bit());
4741  }
4742
4743  llvm_unreachable("Unreachable");
4744  return SDValue();
4745}
4746
4747
4748/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
4749/// take a 2 x i32 value to shift plus a shift amount.
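/// The expansion combines SHLD/SHRD with a plain shift, then uses a CMOV
/// keyed on (ShAmt & VTBits) to select the correct halves when the shift
/// amount is VTBits or more, since the SHLD/SHRD results alone are only
/// valid for amounts below VTBits.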
4750SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
4751  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4752  MVT VT = Op.getValueType();
4753  unsigned VTBits = VT.getSizeInBits();
4754  DebugLoc dl = Op.getDebugLoc();
4755  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
4756  SDValue ShOpLo = Op.getOperand(0);
4757  SDValue ShOpHi = Op.getOperand(1);
4758  SDValue ShAmt  = Op.getOperand(2);
4759  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
4760                                     DAG.getConstant(VTBits - 1, MVT::i8))
4761                       : DAG.getConstant(0, VT);
4762
4763  SDValue Tmp2, Tmp3;
4764  if (Op.getOpcode() == ISD::SHL_PARTS) {
4765    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
4766    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
4767  } else {
4768    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
4769    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
4770  }
4771
4772  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
4773                                DAG.getConstant(VTBits, MVT::i8));
4774  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
4775                             AndNode, DAG.getConstant(0, MVT::i8));
4776
4777  SDValue Hi, Lo;
4778  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
4779  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
4780  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
4781
4782  if (Op.getOpcode() == ISD::SHL_PARTS) {
4783    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
4784    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
4785  } else {
4786    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
4787    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
4788  }
4789
4790  SDValue Ops[2] = { Lo, Hi };
4791  return DAG.getMergeValues(Ops, 2, dl);
4792}
4793
4794SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
4795  MVT SrcVT = Op.getOperand(0).getValueType();
4796
4797  if (SrcVT.isVector()) {
4798    if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
4799      return Op;
4800    }
4801    return SDValue();
4802  }
4803
4804  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
4805         "Unknown SINT_TO_FP to lower!");
4806
4807  // These are really Legal; return the operand so the caller accepts it as
4808  // Legal.
4809  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
4810    return Op;
4811  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
4812      Subtarget->is64Bit()) {
4813    return Op;
4814  }
4815
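  // Otherwise, spill the integer to a stack slot and convert it with the x87
  // FILD instruction (see BuildFILD).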
4816  DebugLoc dl = Op.getDebugLoc();
4817  unsigned Size = SrcVT.getSizeInBits()/8;
4818  MachineFunction &MF = DAG.getMachineFunction();
4819  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
4820  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
4821  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
4822                               StackSlot,
4823                               PseudoSourceValue::getFixedStack(SSFI), 0);
4824  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
4825}
4826
4827SDValue X86TargetLowering::BuildFILD(SDValue Op, MVT SrcVT, SDValue Chain,
4828                                     SDValue StackSlot,
4829                                     SelectionDAG &DAG) {
4830  // Build the FILD
4831  DebugLoc dl = Op.getDebugLoc();
4832  SDVTList Tys;
4833  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
4834  if (useSSE)
4835    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
4836  else
4837    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
4838  SmallVector<SDValue, 8> Ops;
4839  Ops.push_back(Chain);
4840  Ops.push_back(StackSlot);
4841  Ops.push_back(DAG.getValueType(SrcVT));
4842  SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
4843                                 Tys, &Ops[0], Ops.size());
4844
4845  if (useSSE) {
4846    Chain = Result.getValue(1);
4847    SDValue InFlag = Result.getValue(2);
4848
4849    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
4850    // shouldn't be necessary except that RFP cannot be live across
4851    // multiple blocks. When stackifier is fixed, they can be uncoupled.
4852    MachineFunction &MF = DAG.getMachineFunction();
4853    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
4854    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
4855    Tys = DAG.getVTList(MVT::Other);
4856    SmallVector<SDValue, 8> Ops;
4857    Ops.push_back(Chain);
4858    Ops.push_back(Result);
4859    Ops.push_back(StackSlot);
4860    Ops.push_back(DAG.getValueType(Op.getValueType()));
4861    Ops.push_back(InFlag);
4862    Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size());
4863    Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
4864                         PseudoSourceValue::getFixedStack(SSFI), 0);
4865  }
4866
4867  return Result;
4868}
4869
4870// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
4871SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
4872  // This algorithm is not obvious. Here it is in C code, more or less:
4873  /*
4874    double uint64_to_double( uint32_t hi, uint32_t lo ) {
4875      static const __m128i exp = { 0x4330000045300000ULL, 0 };
4876      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
4877
4878      // Copy ints to xmm registers.
4879      __m128i xh = _mm_cvtsi32_si128( hi );
4880      __m128i xl = _mm_cvtsi32_si128( lo );
4881
4882      // Combine into low half of a single xmm register.
4883      __m128i x = _mm_unpacklo_epi32( xh, xl );
4884      __m128d d;
4885      double sd;
4886
4887      // Merge in appropriate exponents to give the integer bits the right
4888      // magnitude.
4889      x = _mm_unpacklo_epi32( x, exp );
4890
4891      // Subtract away the biases to deal with the IEEE-754 double precision
4892      // implicit 1.
4893      d = _mm_sub_pd( (__m128d) x, bias );
4894
4895      // All conversions up to here are exact. The correctly rounded result is
4896      // calculated using the current rounding mode using the following
4897      // horizontal add.
4898      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
4899      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
4900                                // store doesn't really need to be here (except
4901                                // maybe to zero the other double)
4902      return sd;
4903    }
4904  */
4905
4906  DebugLoc dl = Op.getDebugLoc();
4907  LLVMContext *Context = DAG.getContext();
4908
4909  // Build some magic constants.
4910  std::vector<Constant*> CV0;
4911  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
4912  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
4913  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
4914  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
4915  Constant *C0 = ConstantVector::get(CV0);
4916  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
4917
4918  std::vector<Constant*> CV1;
4919  CV1.push_back(
4920    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
4921  CV1.push_back(
4922    ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
4923  Constant *C1 = ConstantVector::get(CV1);
4924  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
4925
4926  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
4927                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
4928                                        Op.getOperand(0),
4929                                        DAG.getIntPtrConstant(1)));
4930  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
4931                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
4932                                        Op.getOperand(0),
4933                                        DAG.getIntPtrConstant(0)));
4934  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
4935  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
4936                              PseudoSourceValue::getConstantPool(), 0,
4937                              false, 16);
4938  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
4939  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
4940  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
4941                              PseudoSourceValue::getConstantPool(), 0,
4942                              false, 16);
4943  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
4944
4945  // Add the halves; easiest way is to swap them into another reg first.
4946  int ShufMask[2] = { 1, -1 };
4947  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
4948                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
4949  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
4950  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
4951                     DAG.getIntPtrConstant(0));
4952}
4953
4954// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
4955SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
4956  DebugLoc dl = Op.getDebugLoc();
4957  // FP constant to bias correct the final result.
4958  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
4959                                   MVT::f64);
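  // 0x4330000000000000 is the double 2^52. OR-ing the 32-bit value into the
  // low mantissa bits of 2^52 yields exactly 2^52 + x, so subtracting the
  // bias below recovers x exactly as a double.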
4960
4961  // Load the 32-bit value into an XMM register.
4962  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
4963                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
4964                                         Op.getOperand(0),
4965                                         DAG.getIntPtrConstant(0)));
4966
4967  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
4968                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
4969                     DAG.getIntPtrConstant(0));
4970
4971  // Or the load with the bias.
4972  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
4973                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
4974                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4975                                                   MVT::v2f64, Load)),
4976                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
4977                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4978                                                   MVT::v2f64, Bias)));
4979  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
4980                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
4981                   DAG.getIntPtrConstant(0));
4982
4983  // Subtract the bias.
4984  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
4985
4986  // Handle final rounding.
4987  MVT DestVT = Op.getValueType();
4988
4989  if (DestVT.bitsLT(MVT::f64)) {
4990    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
4991                       DAG.getIntPtrConstant(0));
4992  } else if (DestVT.bitsGT(MVT::f64)) {
4993    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
4994  }
4995
4996  // Handle final rounding.
4997  return Sub;
4998}
4999
5000SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5001  SDValue N0 = Op.getOperand(0);
5002  DebugLoc dl = Op.getDebugLoc();
5003
5004  // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
5005  // optimize it to a SINT_TO_FP when the sign bit is known to be zero, so
5006  // perform the optimization here.
5007  if (DAG.SignBitIsZero(N0))
5008    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
5009
5010  MVT SrcVT = N0.getValueType();
5011  if (SrcVT == MVT::i64) {
5012    // We only handle SSE2 f64 target here; caller can expand the rest.
5013    if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
5014      return SDValue();
5015
5016    return LowerUINT_TO_FP_i64(Op, DAG);
5017  } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
5018    return LowerUINT_TO_FP_i32(Op, DAG);
5019  }
5020
5021  assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");
5022
5023  // Make a 64-bit buffer, and use it to build an FILD.
5024  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
5025  SDValue WordOff = DAG.getConstant(4, getPointerTy());
5026  SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
5027                                   getPointerTy(), StackSlot, WordOff);
5028  SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5029                                StackSlot, NULL, 0);
5030  SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
5031                                OffsetSlot, NULL, 0);
5032  return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
5033}
5034
5035std::pair<SDValue,SDValue> X86TargetLowering::
5036FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
5037  DebugLoc dl = Op.getDebugLoc();
5038
5039  MVT DstTy = Op.getValueType();
5040
5041  if (!IsSigned) {
5042    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
5043    DstTy = MVT::i64;
5044  }
5045
5046  assert(DstTy.getSimpleVT() <= MVT::i64 &&
5047         DstTy.getSimpleVT() >= MVT::i16 &&
5048         "Unknown FP_TO_SINT to lower!");
5049
5050  // These are really Legal.
5051  if (DstTy == MVT::i32 &&
5052      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5053    return std::make_pair(SDValue(), SDValue());
5054  if (Subtarget->is64Bit() &&
5055      DstTy == MVT::i64 &&
5056      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5057    return std::make_pair(SDValue(), SDValue());
5058
5059  // We lower FP->int into a FIST*_IN_MEM node that stores to a temporary
5060  // stack slot; the caller then loads the result from that slot.
5061  MachineFunction &MF = DAG.getMachineFunction();
5062  unsigned MemSize = DstTy.getSizeInBits()/8;
5063  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
5064  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5065
5066  unsigned Opc;
5067  switch (DstTy.getSimpleVT()) {
5068  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
5069  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
5070  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
5071  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
5072  }
5073
5074  SDValue Chain = DAG.getEntryNode();
5075  SDValue Value = Op.getOperand(0);
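  // If the value is in an SSE register, store it to memory and reload it onto
  // the x87 stack with FLD first, since the FIST* instructions only operate
  // on x87 values.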
5076  if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
5077    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
5078    Chain = DAG.getStore(Chain, dl, Value, StackSlot,
5079                         PseudoSourceValue::getFixedStack(SSFI), 0);
5080    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
5081    SDValue Ops[] = {
5082      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
5083    };
5084    Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
5085    Chain = Value.getValue(1);
5086    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
5087    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5088  }
5089
5090  // Build the FP_TO_INT*_IN_MEM
5091  SDValue Ops[] = { Chain, Value, StackSlot };
5092  SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
5093
5094  return std::make_pair(FIST, StackSlot);
5095}
5096
5097SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
5098  if (Op.getValueType().isVector()) {
5099    if (Op.getValueType() == MVT::v2i32 &&
5100        Op.getOperand(0).getValueType() == MVT::v2f64) {
5101      return Op;
5102    }
5103    return SDValue();
5104  }
5105
5106  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
5107  SDValue FIST = Vals.first, StackSlot = Vals.second;
5108  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
5109  if (FIST.getNode() == 0) return Op;
5110
5111  // Load the result.
5112  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5113                     FIST, StackSlot, NULL, 0);
5114}
5115
5116SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
5117  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
5118  SDValue FIST = Vals.first, StackSlot = Vals.second;
5119  assert(FIST.getNode() && "Unexpected failure");
5120
5121  // Load the result.
5122  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5123                     FIST, StackSlot, NULL, 0);
5124}
5125
5126SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
5127  LLVMContext *Context = DAG.getContext();
5128  DebugLoc dl = Op.getDebugLoc();
5129  MVT VT = Op.getValueType();
5130  MVT EltVT = VT;
5131  if (VT.isVector())
5132    EltVT = VT.getVectorElementType();
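  // fabs is a single FAND (andps/andpd) with a constant-pool mask that has
  // every bit set except the sign bit of each element.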
5133  std::vector<Constant*> CV;
5134  if (EltVT == MVT::f64) {
5135    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
5136    CV.push_back(C);
5137    CV.push_back(C);
5138  } else {
5139    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
5140    CV.push_back(C);
5141    CV.push_back(C);
5142    CV.push_back(C);
5143    CV.push_back(C);
5144  }
5145  Constant *C = ConstantVector::get(CV);
5146  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5147  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5148                               PseudoSourceValue::getConstantPool(), 0,
5149                               false, 16);
5150  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
5151}
5152
5153SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
5154  LLVMContext *Context = DAG.getContext();
5155  DebugLoc dl = Op.getDebugLoc();
5156  MVT VT = Op.getValueType();
5157  MVT EltVT = VT;
5158  unsigned EltNum = 1;
5159  if (VT.isVector()) {
5160    EltVT = VT.getVectorElementType();
5161    EltNum = VT.getVectorNumElements();
5162  }
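  // fneg flips the sign bit: XOR the value with a constant-pool mask that has
  // only the sign bit set in each element.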
5163  std::vector<Constant*> CV;
5164  if (EltVT == MVT::f64) {
5165    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
5166    CV.push_back(C);
5167    CV.push_back(C);
5168  } else {
5169    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
5170    CV.push_back(C);
5171    CV.push_back(C);
5172    CV.push_back(C);
5173    CV.push_back(C);
5174  }
5175  Constant *C = ConstantVector::get(CV);
5176  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5177  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5178                               PseudoSourceValue::getConstantPool(), 0,
5179                               false, 16);
5180  if (VT.isVector()) {
5181    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
5182                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
5183                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5184                                Op.getOperand(0)),
5185                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
5186  } else {
5187    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
5188  }
5189}
5190
5191SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
5192  LLVMContext *Context = DAG.getContext();
5193  SDValue Op0 = Op.getOperand(0);
5194  SDValue Op1 = Op.getOperand(1);
5195  DebugLoc dl = Op.getDebugLoc();
5196  MVT VT = Op.getValueType();
5197  MVT SrcVT = Op1.getValueType();
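  // copysign(Op0, Op1) is computed as (Op0 & ~sign-mask) | (Op1 & sign-mask)
  // using two constant-pool masks and the FAND / FOR nodes below.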
5198
5199  // If second operand is smaller, extend it first.
5200  if (SrcVT.bitsLT(VT)) {
5201    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
5202    SrcVT = VT;
5203  }
5204  // And if it is bigger, shrink it first.
5205  if (SrcVT.bitsGT(VT)) {
5206    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
5207    SrcVT = VT;
5208  }
5209
5210  // At this point the operands and the result should have the same
5211  // type, and that won't be f80 since that is not custom lowered.
5212
5213  // First get the sign bit of second operand.
5214  std::vector<Constant*> CV;
5215  if (SrcVT == MVT::f64) {
5216    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
5217    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5218  } else {
5219    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
5220    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5221    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5222    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5223  }
5224  Constant *C = ConstantVector::get(CV);
5225  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5226  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
5227                                PseudoSourceValue::getConstantPool(), 0,
5228                                false, 16);
5229  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
5230
5231  // Shift sign bit right or left if the two operands have different types.
5232  if (SrcVT.bitsGT(VT)) {
5233    // Op0 is MVT::f32, Op1 is MVT::f64.
5234    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
5235    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
5236                          DAG.getConstant(32, MVT::i32));
5237    SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
5238    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
5239                          DAG.getIntPtrConstant(0));
5240  }
5241
5242  // Clear first operand sign bit.
5243  CV.clear();
5244  if (VT == MVT::f64) {
5245    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
5246    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5247  } else {
5248    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
5249    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5250    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5251    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5252  }
5253  C = ConstantVector::get(CV);
5254  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5255  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5256                                PseudoSourceValue::getConstantPool(), 0,
5257                                false, 16);
5258  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
5259
5260  // Or the value with the sign bit.
5261  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
5262}
5263
5264/// Emit nodes that will be selected as "test Op0,Op0", or something
5265/// equivalent.
5266SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
5267                                    SelectionDAG &DAG) {
5268  DebugLoc dl = Op.getDebugLoc();
5269
5270  // CF and OF aren't always set the way we want. Determine which
5271  // of these we need.
5272  bool NeedCF = false;
5273  bool NeedOF = false;
5274  switch (X86CC) {
5275  case X86::COND_A: case X86::COND_AE:
5276  case X86::COND_B: case X86::COND_BE:
5277    NeedCF = true;
5278    break;
5279  case X86::COND_G: case X86::COND_GE:
5280  case X86::COND_L: case X86::COND_LE:
5281  case X86::COND_O: case X86::COND_NO:
5282    NeedOF = true;
5283    break;
5284  default: break;
5285  }
5286
5287  // See if we can use the EFLAGS value from the operand instead of
5288  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
5289  // we prove that the arithmetic won't overflow, we can't use OF or CF.
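  // For example, for (x + y) != 0 the X86ISD::ADD created below produces both
  // the sum and an EFLAGS value; reusing that flags result lets us drop the
  // separate TEST.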
5290  if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
5291    unsigned Opcode = 0;
5292    unsigned NumOperands = 0;
5293    switch (Op.getNode()->getOpcode()) {
5294    case ISD::ADD:
5295      // Due to an isel shortcoming, be conservative if this add is likely to
5296      // be selected as part of a load-modify-store instruction. When the root
5297      // node in a match is a store, isel doesn't know how to remap non-chain
5298      // non-flag uses of other nodes in the match, such as the ADD in this
5299      // case. This leads to the ADD being left around and reselected, with
5300      // the result being two adds in the output.
5301      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5302           UE = Op.getNode()->use_end(); UI != UE; ++UI)
5303        if (UI->getOpcode() == ISD::STORE)
5304          goto default_case;
5305      if (ConstantSDNode *C =
5306            dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
5307        // An add of one will be selected as an INC.
5308        if (C->getAPIntValue() == 1) {
5309          Opcode = X86ISD::INC;
5310          NumOperands = 1;
5311          break;
5312        }
5313        // An add of negative one (subtract of one) will be selected as a DEC.
5314        if (C->getAPIntValue().isAllOnesValue()) {
5315          Opcode = X86ISD::DEC;
5316          NumOperands = 1;
5317          break;
5318        }
5319      }
5320      // Otherwise use a regular EFLAGS-setting add.
5321      Opcode = X86ISD::ADD;
5322      NumOperands = 2;
5323      break;
5324    case ISD::SUB:
5325      // Due to the ISEL shortcoming noted above, be conservative if this sub is
5326      // likely to be selected as part of a load-modify-store instruction.
5327      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5328           UE = Op.getNode()->use_end(); UI != UE; ++UI)
5329        if (UI->getOpcode() == ISD::STORE)
5330          goto default_case;
5331      // Otherwise use a regular EFLAGS-setting sub.
5332      Opcode = X86ISD::SUB;
5333      NumOperands = 2;
5334      break;
5335    case X86ISD::ADD:
5336    case X86ISD::SUB:
5337    case X86ISD::INC:
5338    case X86ISD::DEC:
5339      return SDValue(Op.getNode(), 1);
5340    default:
5341    default_case:
5342      break;
5343    }
5344    if (Opcode != 0) {
5345      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
5346      SmallVector<SDValue, 4> Ops;
5347      for (unsigned i = 0; i != NumOperands; ++i)
5348        Ops.push_back(Op.getOperand(i));
5349      SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
5350      DAG.ReplaceAllUsesWith(Op, New);
5351      return SDValue(New.getNode(), 1);
5352    }
5353  }
5354
5355  // Otherwise just emit a CMP with 0, which is the TEST pattern.
5356  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
5357                     DAG.getConstant(0, Op.getValueType()));
5358}
5359
5360/// Emit nodes that will be selected as "cmp Op0,Op1", or something
5361/// equivalent.
5362SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
5363                                   SelectionDAG &DAG) {
5364  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
5365    if (C->getAPIntValue() == 0)
5366      return EmitTest(Op0, X86CC, DAG);
5367
5368  DebugLoc dl = Op0.getDebugLoc();
5369  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
5370}
5371
5372SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
5373  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
5374  SDValue Op0 = Op.getOperand(0);
5375  SDValue Op1 = Op.getOperand(1);
5376  DebugLoc dl = Op.getDebugLoc();
5377  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
5378
5379  // Lower (X & (1 << N)) == 0 to BT(X, N).
5380  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
5381  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
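  // For example, (X & (1 << 5)) == 0 becomes BT X, 5 followed by SETAE, since
  // BT copies the selected bit into CF and SETAE tests for CF == 0.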
5382  if (Op0.getOpcode() == ISD::AND &&
5383      Op0.hasOneUse() &&
5384      Op1.getOpcode() == ISD::Constant &&
5385      cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
5386      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5387    SDValue LHS, RHS;
5388    if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
5389      if (ConstantSDNode *Op010C =
5390            dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
5391        if (Op010C->getZExtValue() == 1) {
5392          LHS = Op0.getOperand(0);
5393          RHS = Op0.getOperand(1).getOperand(1);
5394        }
5395    } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
5396      if (ConstantSDNode *Op000C =
5397            dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
5398        if (Op000C->getZExtValue() == 1) {
5399          LHS = Op0.getOperand(1);
5400          RHS = Op0.getOperand(0).getOperand(1);
5401        }
5402    } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
5403      ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
5404      SDValue AndLHS = Op0.getOperand(0);
5405      if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
5406        LHS = AndLHS.getOperand(0);
5407        RHS = AndLHS.getOperand(1);
5408      }
5409    }
5410
5411    if (LHS.getNode()) {
5412      // If LHS is i8, promote it to i16 with any_extend.  There is no i8 BT
5413      // instruction.  Since the shift amount is in-range-or-undefined, we know
5414      // that doing a bittest on the i16 value is ok.  We extend to i32 because
5415      // the encoding for the i16 version is larger than the i32 version.
5416      if (LHS.getValueType() == MVT::i8)
5417        LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
5418
5419      // If the operand types disagree, extend the shift amount to match.  Since
5420      // BT ignores high bits (like shifts) we can use anyextend.
5421      if (LHS.getValueType() != RHS.getValueType())
5422        RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
5423
5424      SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
5425      unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
5426      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
5427                         DAG.getConstant(Cond, MVT::i8), BT);
5428    }
5429  }
5430
5431  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
5432  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
5433
5434  SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
5435  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
5436                     DAG.getConstant(X86CC, MVT::i8), Cond);
5437}
5438
5439SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
5440  SDValue Cond;
5441  SDValue Op0 = Op.getOperand(0);
5442  SDValue Op1 = Op.getOperand(1);
5443  SDValue CC = Op.getOperand(2);
5444  MVT VT = Op.getValueType();
5445  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
5446  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
5447  DebugLoc dl = Op.getDebugLoc();
5448
5449  if (isFP) {
5450    unsigned SSECC = 8;
5451    MVT VT0 = Op0.getValueType();
5452    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
5453    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
5454    bool Swap = false;
5455
5456    switch (SetCCOpcode) {
5457    default: break;
5458    case ISD::SETOEQ:
5459    case ISD::SETEQ:  SSECC = 0; break;
5460    case ISD::SETOGT:
5461    case ISD::SETGT: Swap = true; // Fallthrough
5462    case ISD::SETLT:
5463    case ISD::SETOLT: SSECC = 1; break;
5464    case ISD::SETOGE:
5465    case ISD::SETGE: Swap = true; // Fallthrough
5466    case ISD::SETLE:
5467    case ISD::SETOLE: SSECC = 2; break;
5468    case ISD::SETUO:  SSECC = 3; break;
5469    case ISD::SETUNE:
5470    case ISD::SETNE:  SSECC = 4; break;
5471    case ISD::SETULE: Swap = true;
5472    case ISD::SETUGE: SSECC = 5; break;
5473    case ISD::SETULT: Swap = true;
5474    case ISD::SETUGT: SSECC = 6; break;
5475    case ISD::SETO:   SSECC = 7; break;
5476    }
5477    if (Swap)
5478      std::swap(Op0, Op1);
5479
5480    // In the two special cases we can't handle, emit two comparisons.
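    // SETUEQ is emitted as (unordered OR equal) using compare immediates 3 and
    // 0, and SETONE as (ordered AND not-equal) using immediates 7 and 4.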
5481    if (SSECC == 8) {
5482      if (SetCCOpcode == ISD::SETUEQ) {
5483        SDValue UNORD, EQ;
5484        UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
5485        EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
5486        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
5487      }
5488      else if (SetCCOpcode == ISD::SETONE) {
5489        SDValue ORD, NEQ;
5490        ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
5491        NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
5492        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
5493      }
5494      llvm_unreachable("Illegal FP comparison");
5495    }
5496    // Handle all other FP comparisons here.
5497    return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
5498  }
5499
5500  // We are handling one of the integer comparisons here.  Since SSE only has
5501  // GT and EQ comparisons for integer, swapping operands and multiple
5502  // operations may be required for some comparisons.
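  // For example, A < B is computed as B > A (swap), A <= B as NOT (A > B)
  // (invert), and unsigned comparisons first flip the sign bits of both inputs.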
5503  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
5504  bool Swap = false, Invert = false, FlipSigns = false;
5505
5506  switch (VT.getSimpleVT()) {
5507  default: break;
5508  case MVT::v8i8:
5509  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
5510  case MVT::v4i16:
5511  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
5512  case MVT::v2i32:
5513  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
5514  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
5515  }
5516
5517  switch (SetCCOpcode) {
5518  default: break;
5519  case ISD::SETNE:  Invert = true;
5520  case ISD::SETEQ:  Opc = EQOpc; break;
5521  case ISD::SETLT:  Swap = true;
5522  case ISD::SETGT:  Opc = GTOpc; break;
5523  case ISD::SETGE:  Swap = true;
5524  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
5525  case ISD::SETULT: Swap = true;
5526  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
5527  case ISD::SETUGE: Swap = true;
5528  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
5529  }
5530  if (Swap)
5531    std::swap(Op0, Op1);
5532
5533  // Since SSE has no unsigned integer comparisons, we need to flip the sign
5534  // bits of the inputs before performing those operations.
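  // XORing with the sign bit maps unsigned order onto signed order, e.g. an
  // unsigned A > B becomes a signed (A ^ SignBit) > (B ^ SignBit).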
5535  if (FlipSigns) {
5536    MVT EltVT = VT.getVectorElementType();
5537    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
5538                                      EltVT);
5539    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
5540    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
5541                                    SignBits.size());
5542    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
5543    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
5544  }
5545
5546  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
5547
5548  // If the logical-not of the result is required, perform that now.
5549  if (Invert)
5550    Result = DAG.getNOT(dl, Result, VT);
5551
5552  return Result;
5553}
5554
5555// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
5556static bool isX86LogicalCmp(SDValue Op) {
5557  unsigned Opc = Op.getNode()->getOpcode();
5558  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
5559    return true;
5560  if (Op.getResNo() == 1 &&
5561      (Opc == X86ISD::ADD ||
5562       Opc == X86ISD::SUB ||
5563       Opc == X86ISD::SMUL ||
5564       Opc == X86ISD::UMUL ||
5565       Opc == X86ISD::INC ||
5566       Opc == X86ISD::DEC))
5567    return true;
5568
5569  return false;
5570}
5571
5572SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
5573  bool addTest = true;
5574  SDValue Cond  = Op.getOperand(0);
5575  DebugLoc dl = Op.getDebugLoc();
5576  SDValue CC;
5577
5578  if (Cond.getOpcode() == ISD::SETCC)
5579    Cond = LowerSETCC(Cond, DAG);
5580
5581  // If condition flag is set by a X86ISD::CMP, then use it as the condition
5582  // setting operand in place of the X86ISD::SETCC.
5583  if (Cond.getOpcode() == X86ISD::SETCC) {
5584    CC = Cond.getOperand(0);
5585
5586    SDValue Cmp = Cond.getOperand(1);
5587    unsigned Opc = Cmp.getOpcode();
5588    MVT VT = Op.getValueType();
5589
5590    bool IllegalFPCMov = false;
5591    if (VT.isFloatingPoint() && !VT.isVector() &&
5592        !isScalarFPTypeInSSEReg(VT))  // FPStack?
5593      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
5594
5595    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
5596        Opc == X86ISD::BT) { // FIXME
5597      Cond = Cmp;
5598      addTest = false;
5599    }
5600  }
5601
5602  if (addTest) {
5603    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5604    Cond = EmitTest(Cond, X86::COND_NE, DAG);
5605  }
5606
5607  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
5608  SmallVector<SDValue, 4> Ops;
5609  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
5610  // condition is true.
5611  Ops.push_back(Op.getOperand(2));
5612  Ops.push_back(Op.getOperand(1));
5613  Ops.push_back(CC);
5614  Ops.push_back(Cond);
5615  return DAG.getNode(X86ISD::CMOV, dl, VTs, &Ops[0], Ops.size());
5616}
5617
5618// isAndOrOfSetCCs - Return true if node is an ISD::AND or
5619// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
5620// from the AND / OR.
5621static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
5622  Opc = Op.getOpcode();
5623  if (Opc != ISD::OR && Opc != ISD::AND)
5624    return false;
5625  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
5626          Op.getOperand(0).hasOneUse() &&
5627          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
5628          Op.getOperand(1).hasOneUse());
5629}
5630
5631// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
5632// 1 and that the SETCC node has a single use.
5633static bool isXor1OfSetCC(SDValue Op) {
5634  if (Op.getOpcode() != ISD::XOR)
5635    return false;
5636  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5637  if (N1C && N1C->getAPIntValue() == 1) {
5638    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
5639      Op.getOperand(0).hasOneUse();
5640  }
5641  return false;
5642}
5643
5644SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
5645  bool addTest = true;
5646  SDValue Chain = Op.getOperand(0);
5647  SDValue Cond  = Op.getOperand(1);
5648  SDValue Dest  = Op.getOperand(2);
5649  DebugLoc dl = Op.getDebugLoc();
5650  SDValue CC;
5651
5652  if (Cond.getOpcode() == ISD::SETCC)
5653    Cond = LowerSETCC(Cond, DAG);
5654#if 0
5655  // FIXME: LowerXALUO doesn't handle these!!
5656  else if (Cond.getOpcode() == X86ISD::ADD  ||
5657           Cond.getOpcode() == X86ISD::SUB  ||
5658           Cond.getOpcode() == X86ISD::SMUL ||
5659           Cond.getOpcode() == X86ISD::UMUL)
5660    Cond = LowerXALUO(Cond, DAG);
5661#endif
5662
5663  // If condition flag is set by a X86ISD::CMP, then use it as the condition
5664  // setting operand in place of the X86ISD::SETCC.
5665  if (Cond.getOpcode() == X86ISD::SETCC) {
5666    CC = Cond.getOperand(0);
5667
5668    SDValue Cmp = Cond.getOperand(1);
5669    unsigned Opc = Cmp.getOpcode();
5670    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
5671    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
5672      Cond = Cmp;
5673      addTest = false;
5674    } else {
5675      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
5676      default: break;
5677      case X86::COND_O:
5678      case X86::COND_B:
5679        // These can only come from an arithmetic instruction with overflow,
5680        // e.g. SADDO, UADDO.
5681        Cond = Cond.getNode()->getOperand(1);
5682        addTest = false;
5683        break;
5684      }
5685    }
5686  } else {
5687    unsigned CondOpc;
5688    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
5689      SDValue Cmp = Cond.getOperand(0).getOperand(1);
5690      if (CondOpc == ISD::OR) {
5691        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
5692        // two branches instead of an explicit OR instruction with a
5693        // separate test.
5694        if (Cmp == Cond.getOperand(1).getOperand(1) &&
5695            isX86LogicalCmp(Cmp)) {
5696          CC = Cond.getOperand(0).getOperand(0);
5697          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
5698                              Chain, Dest, CC, Cmp);
5699          CC = Cond.getOperand(1).getOperand(0);
5700          Cond = Cmp;
5701          addTest = false;
5702        }
5703      } else { // ISD::AND
5704        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
5705        // two branches instead of an explicit AND instruction with a
5706        // separate test. However, we only do this if this block doesn't
5707        // have a fall-through edge, because this requires an explicit
5708        // jmp when the condition is false.
5709        if (Cmp == Cond.getOperand(1).getOperand(1) &&
5710            isX86LogicalCmp(Cmp) &&
5711            Op.getNode()->hasOneUse()) {
5712          X86::CondCode CCode =
5713            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
5714          CCode = X86::GetOppositeBranchCondition(CCode);
5715          CC = DAG.getConstant(CCode, MVT::i8);
5716          SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
5717          // Look for an unconditional branch following this conditional branch.
5718          // We need this because we need to reverse the successors in order
5719          // to implement FCMP_OEQ.
5720          if (User.getOpcode() == ISD::BR) {
5721            SDValue FalseBB = User.getOperand(1);
5722            SDValue NewBR =
5723              DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
5724            assert(NewBR == User);
5725            Dest = FalseBB;
5726
5727            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
5728                                Chain, Dest, CC, Cmp);
5729            X86::CondCode CCode =
5730              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
5731            CCode = X86::GetOppositeBranchCondition(CCode);
5732            CC = DAG.getConstant(CCode, MVT::i8);
5733            Cond = Cmp;
5734            addTest = false;
5735          }
5736        }
5737      }
5738    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
5739      // Recognize the xorb (setcc), 1 pattern: the xor inverts the condition.
5740      // This is normally transformed by the dag combiner, except when the
5741      // condition is set by an arithmetic-with-overflow node.
5742      X86::CondCode CCode =
5743        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
5744      CCode = X86::GetOppositeBranchCondition(CCode);
5745      CC = DAG.getConstant(CCode, MVT::i8);
5746      Cond = Cond.getOperand(0).getOperand(1);
5747      addTest = false;
5748    }
5749  }
5750
5751  if (addTest) {
5752    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5753    Cond = EmitTest(Cond, X86::COND_NE, DAG);
5754  }
5755  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
5756                     Chain, Dest, CC, Cond);
5757}
5758
5759
5760// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
5761// Calls to _alloca are needed to probe the stack when allocating more than 4k
5762// bytes in one go. Touching the stack at 4K increments is necessary to ensure
5763// that the guard pages used by the OS virtual memory manager are allocated in
5764// correct sequence.
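// For example, a single 12KB allocation must still touch each intervening 4KB
// page so that every guard page is faulted in and committed in order.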
5765SDValue
5766X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
5767                                           SelectionDAG &DAG) {
5768  assert(Subtarget->isTargetCygMing() &&
5769         "This should be used only on Cygwin/Mingw targets");
5770  DebugLoc dl = Op.getDebugLoc();
5771
5772  // Get the inputs.
5773  SDValue Chain = Op.getOperand(0);
5774  SDValue Size  = Op.getOperand(1);
5775  // FIXME: Ensure alignment here
5776
5777  SDValue Flag;
5778
5779  MVT IntPtr = getPointerTy();
5780  MVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
5781
5782  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
5783
5784  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
5785  Flag = Chain.getValue(1);
5786
5787  SDVTList  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
5788  SDValue Ops[] = { Chain,
5789                      DAG.getTargetExternalSymbol("_alloca", IntPtr),
5790                      DAG.getRegister(X86::EAX, IntPtr),
5791                      DAG.getRegister(X86StackPtr, SPTy),
5792                      Flag };
5793  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5);
5794  Flag = Chain.getValue(1);
5795
5796  Chain = DAG.getCALLSEQ_END(Chain,
5797                             DAG.getIntPtrConstant(0, true),
5798                             DAG.getIntPtrConstant(0, true),
5799                             Flag);
5800
5801  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
5802
5803  SDValue Ops1[2] = { Chain.getValue(0), Chain };
5804  return DAG.getMergeValues(Ops1, 2, dl);
5805}
5806
5807SDValue
5808X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
5809                                           SDValue Chain,
5810                                           SDValue Dst, SDValue Src,
5811                                           SDValue Size, unsigned Align,
5812                                           const Value *DstSV,
5813                                           uint64_t DstSVOff) {
5814  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
5815
5816  // If not DWORD aligned or size is more than the threshold, call the library.
5817  // The libc version is likely to be faster for these cases. It can use the
5818  // address value and run time information about the CPU.
5819  if ((Align & 3) != 0 ||
5820      !ConstantSize ||
5821      ConstantSize->getZExtValue() >
5822        getSubtarget()->getMaxInlineSizeThreshold()) {
5823    SDValue InFlag(0, 0);
5824
5825    // Check to see if there is a specialized entry-point for memory zeroing.
5826    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
5827
5828    if (const char *bzeroEntry =  V &&
5829        V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
5830      MVT IntPtr = getPointerTy();
5831      const Type *IntPtrTy = TD->getIntPtrType();
5832      TargetLowering::ArgListTy Args;
5833      TargetLowering::ArgListEntry Entry;
5834      Entry.Node = Dst;
5835      Entry.Ty = IntPtrTy;
5836      Args.push_back(Entry);
5837      Entry.Node = Size;
5838      Args.push_back(Entry);
5839      std::pair<SDValue,SDValue> CallResult =
5840        LowerCallTo(Chain, Type::VoidTy, false, false, false, false,
5841                    0, CallingConv::C, false, /*isReturnValueUsed=*/false,
5842                    DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
5843      return CallResult.second;
5844    }
5845
5846    // Otherwise have the target-independent code call memset.
5847    return SDValue();
5848  }
5849
5850  uint64_t SizeVal = ConstantSize->getZExtValue();
5851  SDValue InFlag(0, 0);
5852  MVT AVT;
5853  SDValue Count;
5854  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
5855  unsigned BytesLeft = 0;
5856  bool TwoRepStos = false;
5857  if (ValC) {
5858    unsigned ValReg;
5859    uint64_t Val = ValC->getZExtValue() & 255;
5860
5861    // If the value is a constant, then we can potentially use wider stores.
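    // For example, a DWORD-aligned memset with byte value 0xAB broadcasts the
    // value to 0xABABABAB in EAX and stores four bytes per REP STOS iteration.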
5862    switch (Align & 3) {
5863    case 2:   // WORD aligned
5864      AVT = MVT::i16;
5865      ValReg = X86::AX;
5866      Val = (Val << 8) | Val;
5867      break;
5868    case 0:  // DWORD aligned
5869      AVT = MVT::i32;
5870      ValReg = X86::EAX;
5871      Val = (Val << 8)  | Val;
5872      Val = (Val << 16) | Val;
5873      if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
5874        AVT = MVT::i64;
5875        ValReg = X86::RAX;
5876        Val = (Val << 32) | Val;
5877      }
5878      break;
5879    default:  // Byte aligned
5880      AVT = MVT::i8;
5881      ValReg = X86::AL;
5882      Count = DAG.getIntPtrConstant(SizeVal);
5883      break;
5884    }
5885
5886    if (AVT.bitsGT(MVT::i8)) {
5887      unsigned UBytes = AVT.getSizeInBits() / 8;
5888      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
5889      BytesLeft = SizeVal % UBytes;
5890    }
5891
5892    Chain  = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
5893                              InFlag);
5894    InFlag = Chain.getValue(1);
5895  } else {
5896    AVT = MVT::i8;
5897    Count  = DAG.getIntPtrConstant(SizeVal);
5898    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
5899    InFlag = Chain.getValue(1);
5900  }
5901
5902  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
5903                                                              X86::ECX,
5904                            Count, InFlag);
5905  InFlag = Chain.getValue(1);
5906  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
5907                                                              X86::EDI,
5908                            Dst, InFlag);
5909  InFlag = Chain.getValue(1);
5910
5911  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
5912  SmallVector<SDValue, 8> Ops;
5913  Ops.push_back(Chain);
5914  Ops.push_back(DAG.getValueType(AVT));
5915  Ops.push_back(InFlag);
5916  Chain  = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
5917
5918  if (TwoRepStos) {
5919    InFlag = Chain.getValue(1);
5920    Count  = Size;
5921    MVT CVT = Count.getValueType();
5922    SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
5923                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
5924    Chain  = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
5925                                                             X86::ECX,
5926                              Left, InFlag);
5927    InFlag = Chain.getValue(1);
5928    Tys = DAG.getVTList(MVT::Other, MVT::Flag);
5929    Ops.clear();
5930    Ops.push_back(Chain);
5931    Ops.push_back(DAG.getValueType(MVT::i8));
5932    Ops.push_back(InFlag);
5933    Chain  = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
5934  } else if (BytesLeft) {
5935    // Handle the last 1 - 7 bytes.
5936    unsigned Offset = SizeVal - BytesLeft;
5937    MVT AddrVT = Dst.getValueType();
5938    MVT SizeVT = Size.getValueType();
5939
5940    Chain = DAG.getMemset(Chain, dl,
5941                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
5942                                      DAG.getConstant(Offset, AddrVT)),
5943                          Src,
5944                          DAG.getConstant(BytesLeft, SizeVT),
5945                          Align, DstSV, DstSVOff + Offset);
5946  }
5947
5948  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
5949  return Chain;
5950}
5951
5952SDValue
5953X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
5954                                      SDValue Chain, SDValue Dst, SDValue Src,
5955                                      SDValue Size, unsigned Align,
5956                                      bool AlwaysInline,
5957                                      const Value *DstSV, uint64_t DstSVOff,
5958                                      const Value *SrcSV, uint64_t SrcSVOff) {
5959  // This requires the copy size to be a constant, preferably
5960  // within a subtarget-specific limit.
5961  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
5962  if (!ConstantSize)
5963    return SDValue();
5964  uint64_t SizeVal = ConstantSize->getZExtValue();
5965  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
5966    return SDValue();
5967
5968  // If not DWORD aligned, fall back to the library call.
5969  if ((Align & 3) != 0)
5970    return SDValue();
5971
5972  // DWORD aligned
5973  MVT AVT = MVT::i32;
5974  if (Subtarget->is64Bit() && ((Align & 0x7) == 0))  // QWORD aligned
5975    AVT = MVT::i64;
5976
5977  unsigned UBytes = AVT.getSizeInBits() / 8;
5978  unsigned CountVal = SizeVal / UBytes;
5979  SDValue Count = DAG.getIntPtrConstant(CountVal);
5980  unsigned BytesLeft = SizeVal % UBytes;
5981
5982  SDValue InFlag(0, 0);
5983  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
5984                                                              X86::ECX,
5985                            Count, InFlag);
5986  InFlag = Chain.getValue(1);
5987  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
5988                                                             X86::EDI,
5989                            Dst, InFlag);
5990  InFlag = Chain.getValue(1);
5991  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
5992                                                              X86::ESI,
5993                            Src, InFlag);
5994  InFlag = Chain.getValue(1);
5995
5996  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
5997  SmallVector<SDValue, 8> Ops;
5998  Ops.push_back(Chain);
5999  Ops.push_back(DAG.getValueType(AVT));
6000  Ops.push_back(InFlag);
6001  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size());
6002
6003  SmallVector<SDValue, 4> Results;
6004  Results.push_back(RepMovs);
6005  if (BytesLeft) {
6006    // Handle the last 1 - 7 bytes.
6007    unsigned Offset = SizeVal - BytesLeft;
6008    MVT DstVT = Dst.getValueType();
6009    MVT SrcVT = Src.getValueType();
6010    MVT SizeVT = Size.getValueType();
6011    Results.push_back(DAG.getMemcpy(Chain, dl,
6012                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
6013                                                DAG.getConstant(Offset, DstVT)),
6014                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
6015                                                DAG.getConstant(Offset, SrcVT)),
6016                                    DAG.getConstant(BytesLeft, SizeVT),
6017                                    Align, AlwaysInline,
6018                                    DstSV, DstSVOff + Offset,
6019                                    SrcSV, SrcSVOff + Offset));
6020  }
6021
6022  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6023                     &Results[0], Results.size());
6024}
6025
6026SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
6027  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6028  DebugLoc dl = Op.getDebugLoc();
6029
6030  if (!Subtarget->is64Bit()) {
6031    // vastart just stores the address of the VarArgsFrameIndex slot into the
6032    // memory location argument.
6033    SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
6034    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
6035  }
6036
6037  // __va_list_tag:
6038  //   gp_offset         (0 - 6 * 8: offset to the next free GP register slot)
6039  //   fp_offset         (48 - 48 + 8 * 16: offset to the next free XMM slot)
6040  //   overflow_arg_area (points to parameters passed in memory)
6041  //   reg_save_area
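  // This matches the x86-64 ABI layout:
  //   struct __va_list_tag {
  //     unsigned gp_offset; unsigned fp_offset;
  //     void *overflow_arg_area; void *reg_save_area;
  //   };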
6042  SmallVector<SDValue, 8> MemOps;
6043  SDValue FIN = Op.getOperand(1);
6044  // Store gp_offset
6045  SDValue Store = DAG.getStore(Op.getOperand(0), dl,
6046                                 DAG.getConstant(VarArgsGPOffset, MVT::i32),
6047                                 FIN, SV, 0);
6048  MemOps.push_back(Store);
6049
6050  // Store fp_offset
6051  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6052                    FIN, DAG.getIntPtrConstant(4));
6053  Store = DAG.getStore(Op.getOperand(0), dl,
6054                       DAG.getConstant(VarArgsFPOffset, MVT::i32),
6055                       FIN, SV, 0);
6056  MemOps.push_back(Store);
6057
6058  // Store ptr to overflow_arg_area
6059  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6060                    FIN, DAG.getIntPtrConstant(4));
6061  SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
6062  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0);
6063  MemOps.push_back(Store);
6064
6065  // Store ptr to reg_save_area.
6066  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6067                    FIN, DAG.getIntPtrConstant(8));
6068  SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
6069  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0);
6070  MemOps.push_back(Store);
6071  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6072                     &MemOps[0], MemOps.size());
6073}
6074
6075SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
6076  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6077  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
6078  SDValue Chain = Op.getOperand(0);
6079  SDValue SrcPtr = Op.getOperand(1);
6080  SDValue SrcSV = Op.getOperand(2);
6081
6082  llvm_report_error("VAArgInst is not yet implemented for x86-64!");
6083  return SDValue();
6084}
6085
6086SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
6087  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6088  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
6089  SDValue Chain = Op.getOperand(0);
6090  SDValue DstPtr = Op.getOperand(1);
6091  SDValue SrcPtr = Op.getOperand(2);
6092  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
6093  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
6094  DebugLoc dl = Op.getDebugLoc();
6095
6096  return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
6097                       DAG.getIntPtrConstant(24), 8, false,
6098                       DstSV, 0, SrcSV, 0);
6099}
6100
6101SDValue
6102X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
6103  DebugLoc dl = Op.getDebugLoc();
6104  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6105  switch (IntNo) {
6106  default: return SDValue();    // Don't custom lower most intrinsics.
6107  // Comparison intrinsics.
6108  case Intrinsic::x86_sse_comieq_ss:
6109  case Intrinsic::x86_sse_comilt_ss:
6110  case Intrinsic::x86_sse_comile_ss:
6111  case Intrinsic::x86_sse_comigt_ss:
6112  case Intrinsic::x86_sse_comige_ss:
6113  case Intrinsic::x86_sse_comineq_ss:
6114  case Intrinsic::x86_sse_ucomieq_ss:
6115  case Intrinsic::x86_sse_ucomilt_ss:
6116  case Intrinsic::x86_sse_ucomile_ss:
6117  case Intrinsic::x86_sse_ucomigt_ss:
6118  case Intrinsic::x86_sse_ucomige_ss:
6119  case Intrinsic::x86_sse_ucomineq_ss:
6120  case Intrinsic::x86_sse2_comieq_sd:
6121  case Intrinsic::x86_sse2_comilt_sd:
6122  case Intrinsic::x86_sse2_comile_sd:
6123  case Intrinsic::x86_sse2_comigt_sd:
6124  case Intrinsic::x86_sse2_comige_sd:
6125  case Intrinsic::x86_sse2_comineq_sd:
6126  case Intrinsic::x86_sse2_ucomieq_sd:
6127  case Intrinsic::x86_sse2_ucomilt_sd:
6128  case Intrinsic::x86_sse2_ucomile_sd:
6129  case Intrinsic::x86_sse2_ucomigt_sd:
6130  case Intrinsic::x86_sse2_ucomige_sd:
6131  case Intrinsic::x86_sse2_ucomineq_sd: {
6132    unsigned Opc = 0;
6133    ISD::CondCode CC = ISD::SETCC_INVALID;
6134    switch (IntNo) {
6135    default: break;
6136    case Intrinsic::x86_sse_comieq_ss:
6137    case Intrinsic::x86_sse2_comieq_sd:
6138      Opc = X86ISD::COMI;
6139      CC = ISD::SETEQ;
6140      break;
6141    case Intrinsic::x86_sse_comilt_ss:
6142    case Intrinsic::x86_sse2_comilt_sd:
6143      Opc = X86ISD::COMI;
6144      CC = ISD::SETLT;
6145      break;
6146    case Intrinsic::x86_sse_comile_ss:
6147    case Intrinsic::x86_sse2_comile_sd:
6148      Opc = X86ISD::COMI;
6149      CC = ISD::SETLE;
6150      break;
6151    case Intrinsic::x86_sse_comigt_ss:
6152    case Intrinsic::x86_sse2_comigt_sd:
6153      Opc = X86ISD::COMI;
6154      CC = ISD::SETGT;
6155      break;
6156    case Intrinsic::x86_sse_comige_ss:
6157    case Intrinsic::x86_sse2_comige_sd:
6158      Opc = X86ISD::COMI;
6159      CC = ISD::SETGE;
6160      break;
6161    case Intrinsic::x86_sse_comineq_ss:
6162    case Intrinsic::x86_sse2_comineq_sd:
6163      Opc = X86ISD::COMI;
6164      CC = ISD::SETNE;
6165      break;
6166    case Intrinsic::x86_sse_ucomieq_ss:
6167    case Intrinsic::x86_sse2_ucomieq_sd:
6168      Opc = X86ISD::UCOMI;
6169      CC = ISD::SETEQ;
6170      break;
6171    case Intrinsic::x86_sse_ucomilt_ss:
6172    case Intrinsic::x86_sse2_ucomilt_sd:
6173      Opc = X86ISD::UCOMI;
6174      CC = ISD::SETLT;
6175      break;
6176    case Intrinsic::x86_sse_ucomile_ss:
6177    case Intrinsic::x86_sse2_ucomile_sd:
6178      Opc = X86ISD::UCOMI;
6179      CC = ISD::SETLE;
6180      break;
6181    case Intrinsic::x86_sse_ucomigt_ss:
6182    case Intrinsic::x86_sse2_ucomigt_sd:
6183      Opc = X86ISD::UCOMI;
6184      CC = ISD::SETGT;
6185      break;
6186    case Intrinsic::x86_sse_ucomige_ss:
6187    case Intrinsic::x86_sse2_ucomige_sd:
6188      Opc = X86ISD::UCOMI;
6189      CC = ISD::SETGE;
6190      break;
6191    case Intrinsic::x86_sse_ucomineq_ss:
6192    case Intrinsic::x86_sse2_ucomineq_sd:
6193      Opc = X86ISD::UCOMI;
6194      CC = ISD::SETNE;
6195      break;
6196    }
6197
6198    SDValue LHS = Op.getOperand(1);
6199    SDValue RHS = Op.getOperand(2);
6200    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
6201    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
6202    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6203                                DAG.getConstant(X86CC, MVT::i8), Cond);
6204    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6205  }
6206  // ptest intrinsics. The intrinsics these come from are designed to return
6207  // an integer value, not just set flags, so lower them to the ptest
6208  // pattern followed by a setcc on the result.
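  // For example, ptestz(a, b) becomes PTEST a, b followed by SETE, i.e. it
  // yields 1 exactly when ZF is set (all bits of a & b are zero).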
6209  case Intrinsic::x86_sse41_ptestz:
6210  case Intrinsic::x86_sse41_ptestc:
6211  case Intrinsic::x86_sse41_ptestnzc:{
6212    unsigned X86CC = 0;
6213    switch (IntNo) {
6214    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
6215    case Intrinsic::x86_sse41_ptestz:
6216      // ZF = 1
6217      X86CC = X86::COND_E;
6218      break;
6219    case Intrinsic::x86_sse41_ptestc:
6220      // CF = 1
6221      X86CC = X86::COND_B;
6222      break;
6223    case Intrinsic::x86_sse41_ptestnzc:
6224      // ZF and CF = 0
6225      X86CC = X86::COND_A;
6226      break;
6227    }
6228
6229    SDValue LHS = Op.getOperand(1);
6230    SDValue RHS = Op.getOperand(2);
6231    SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS);
6232    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
6233    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
6234    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6235  }
6236
6237  // Fix vector shift instructions where the last operand is a non-immediate
6238  // i32 value.
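  // A variable count is handled by switching to the vector-count form of the
  // intrinsic (e.g. pslli_w -> psll_w) and placing the i32 amount into element
  // 0 of a vector via SCALAR_TO_VECTOR + BIT_CONVERT.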
6239  case Intrinsic::x86_sse2_pslli_w:
6240  case Intrinsic::x86_sse2_pslli_d:
6241  case Intrinsic::x86_sse2_pslli_q:
6242  case Intrinsic::x86_sse2_psrli_w:
6243  case Intrinsic::x86_sse2_psrli_d:
6244  case Intrinsic::x86_sse2_psrli_q:
6245  case Intrinsic::x86_sse2_psrai_w:
6246  case Intrinsic::x86_sse2_psrai_d:
6247  case Intrinsic::x86_mmx_pslli_w:
6248  case Intrinsic::x86_mmx_pslli_d:
6249  case Intrinsic::x86_mmx_pslli_q:
6250  case Intrinsic::x86_mmx_psrli_w:
6251  case Intrinsic::x86_mmx_psrli_d:
6252  case Intrinsic::x86_mmx_psrli_q:
6253  case Intrinsic::x86_mmx_psrai_w:
6254  case Intrinsic::x86_mmx_psrai_d: {
6255    SDValue ShAmt = Op.getOperand(2);
6256    if (isa<ConstantSDNode>(ShAmt))
6257      return SDValue();
6258
6259    unsigned NewIntNo = 0;
6260    MVT ShAmtVT = MVT::v4i32;
6261    switch (IntNo) {
6262    case Intrinsic::x86_sse2_pslli_w:
6263      NewIntNo = Intrinsic::x86_sse2_psll_w;
6264      break;
6265    case Intrinsic::x86_sse2_pslli_d:
6266      NewIntNo = Intrinsic::x86_sse2_psll_d;
6267      break;
6268    case Intrinsic::x86_sse2_pslli_q:
6269      NewIntNo = Intrinsic::x86_sse2_psll_q;
6270      break;
6271    case Intrinsic::x86_sse2_psrli_w:
6272      NewIntNo = Intrinsic::x86_sse2_psrl_w;
6273      break;
6274    case Intrinsic::x86_sse2_psrli_d:
6275      NewIntNo = Intrinsic::x86_sse2_psrl_d;
6276      break;
6277    case Intrinsic::x86_sse2_psrli_q:
6278      NewIntNo = Intrinsic::x86_sse2_psrl_q;
6279      break;
6280    case Intrinsic::x86_sse2_psrai_w:
6281      NewIntNo = Intrinsic::x86_sse2_psra_w;
6282      break;
6283    case Intrinsic::x86_sse2_psrai_d:
6284      NewIntNo = Intrinsic::x86_sse2_psra_d;
6285      break;
6286    default: {
6287      ShAmtVT = MVT::v2i32;
6288      switch (IntNo) {
6289      case Intrinsic::x86_mmx_pslli_w:
6290        NewIntNo = Intrinsic::x86_mmx_psll_w;
6291        break;
6292      case Intrinsic::x86_mmx_pslli_d:
6293        NewIntNo = Intrinsic::x86_mmx_psll_d;
6294        break;
6295      case Intrinsic::x86_mmx_pslli_q:
6296        NewIntNo = Intrinsic::x86_mmx_psll_q;
6297        break;
6298      case Intrinsic::x86_mmx_psrli_w:
6299        NewIntNo = Intrinsic::x86_mmx_psrl_w;
6300        break;
6301      case Intrinsic::x86_mmx_psrli_d:
6302        NewIntNo = Intrinsic::x86_mmx_psrl_d;
6303        break;
6304      case Intrinsic::x86_mmx_psrli_q:
6305        NewIntNo = Intrinsic::x86_mmx_psrl_q;
6306        break;
6307      case Intrinsic::x86_mmx_psrai_w:
6308        NewIntNo = Intrinsic::x86_mmx_psra_w;
6309        break;
6310      case Intrinsic::x86_mmx_psrai_d:
6311        NewIntNo = Intrinsic::x86_mmx_psra_d;
6312        break;
6313      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
6314      }
6315      break;
6316    }
6317    }
6318    MVT VT = Op.getValueType();
6319    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT,
6320                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt));
6321    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6322                       DAG.getConstant(NewIntNo, MVT::i32),
6323                       Op.getOperand(1), ShAmt);
6324  }
6325  }
6326}
6327
6328SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
6329  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6330  DebugLoc dl = Op.getDebugLoc();
6331
6332  if (Depth > 0) {
6333    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6334    SDValue Offset =
6335      DAG.getConstant(TD->getPointerSize(),
6336                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
6337    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6338                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
6339                                   FrameAddr, Offset),
6340                       NULL, 0);
6341  }
6342
6343  // Just load the return address.
6344  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
6345  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6346                     RetAddrFI, NULL, 0);
6347}
6348
6349SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
6350  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
6351  MFI->setFrameAddressIsTaken(true);
6352  MVT VT = Op.getValueType();
6353  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
6354  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6355  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
6356  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6357  while (Depth--)
6358    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
6359  return FrameAddr;
6360}
6361
6362SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
6363                                                     SelectionDAG &DAG) {
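  // The two pointer-sized slots skipped here are presumably the saved frame
  // pointer and the return address that sit between the frame pointer and the
  // incoming arguments.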
6364  return DAG.getIntPtrConstant(2*TD->getPointerSize());
6365}
6366
6367SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
6368{
6369  MachineFunction &MF = DAG.getMachineFunction();
6370  SDValue Chain     = Op.getOperand(0);
6371  SDValue Offset    = Op.getOperand(1);
6372  SDValue Handler   = Op.getOperand(2);
6373  DebugLoc dl       = Op.getDebugLoc();
6374
6375  SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
6376                                  getPointerTy());
6377  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
6378
6379  SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
6380                                  DAG.getIntPtrConstant(-TD->getPointerSize()));
6381  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
6382  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0);
6383  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
6384  MF.getRegInfo().addLiveOut(StoreAddrReg);
6385
6386  return DAG.getNode(X86ISD::EH_RETURN, dl,
6387                     MVT::Other,
6388                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
6389}
6390
6391SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
6392                                             SelectionDAG &DAG) {
6393  SDValue Root = Op.getOperand(0);
6394  SDValue Trmp = Op.getOperand(1); // trampoline
6395  SDValue FPtr = Op.getOperand(2); // nested function
6396  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
6397  DebugLoc dl  = Op.getDebugLoc();
6398
6399  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
6400
6401  const X86InstrInfo *TII =
6402    ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
6403
6404  if (Subtarget->is64Bit()) {
6405    SDValue OutChains[6];
6406
6407    // Large code-model.
6408
6409    const unsigned char JMP64r  = TII->getBaseOpcodeFor(X86::JMP64r);
6410    const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri);
6411
6412    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
6413    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
6414
6415    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
6416
6417    // Load the pointer to the nested function into R11.
6418    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
6419    SDValue Addr = Trmp;
6420    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
6421                                Addr, TrmpAddr, 0);
6422
6423    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6424                       DAG.getConstant(2, MVT::i64));
6425    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2);
6426
6427    // Load the 'nest' parameter value into R10.
6428    // R10 is specified in X86CallingConv.td
6429    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
6430    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6431                       DAG.getConstant(10, MVT::i64));
6432    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
6433                                Addr, TrmpAddr, 10);
6434
6435    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6436                       DAG.getConstant(12, MVT::i64));
6437    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2);
6438
6439    // Jump to the nested function.
6440    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
6441    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6442                       DAG.getConstant(20, MVT::i64));
6443    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
6444                                Addr, TrmpAddr, 20);
6445
6446    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
6447    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6448                       DAG.getConstant(22, MVT::i64));
6449    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
6450                                TrmpAddr, 22);
6451
6452    SDValue Ops[] =
6453      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
6454    return DAG.getMergeValues(Ops, 2, dl);
6455  } else {
6456    const Function *Func =
6457      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
6458    unsigned CC = Func->getCallingConv();
6459    unsigned NestReg;
6460
6461    switch (CC) {
6462    default:
6463      llvm_unreachable("Unsupported calling convention");
6464    case CallingConv::C:
6465    case CallingConv::X86_StdCall: {
6466      // Pass 'nest' parameter in ECX.
6467      // Must be kept in sync with X86CallingConv.td
6468      NestReg = X86::ECX;
6469
6470      // Check that ECX wasn't needed by an 'inreg' parameter.
6471      const FunctionType *FTy = Func->getFunctionType();
6472      const AttrListPtr &Attrs = Func->getAttributes();
6473
6474      if (!Attrs.isEmpty() && !Func->isVarArg()) {
6475        unsigned InRegCount = 0;
6476        unsigned Idx = 1;
6477
6478        for (FunctionType::param_iterator I = FTy->param_begin(),
6479             E = FTy->param_end(); I != E; ++I, ++Idx)
6480          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
6481            // FIXME: should only count parameters that are lowered to integers.
6482            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
6483
6484        if (InRegCount > 2) {
6485          llvm_report_error("Nest register in use - reduce number of inreg parameters!");
6486        }
6487      }
6488      break;
6489    }
6490    case CallingConv::X86_FastCall:
6491    case CallingConv::Fast:
6492      // Pass 'nest' parameter in EAX.
6493      // Must be kept in sync with X86CallingConv.td
6494      NestReg = X86::EAX;
6495      break;
6496    }
6497
6498    SDValue OutChains[4];
6499    SDValue Addr, Disp;
6500
6501    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6502                       DAG.getConstant(10, MVT::i32));
6503    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
6504
6505    const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri);
6506    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
6507    OutChains[0] = DAG.getStore(Root, dl,
6508                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
6509                                Trmp, TrmpAddr, 0);
6510
6511    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6512                       DAG.getConstant(1, MVT::i32));
6513    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1);
6514
6515    const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP);
6516    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6517                       DAG.getConstant(5, MVT::i32));
6518    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
6519                                TrmpAddr, 5, false, 1);
6520
6521    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6522                       DAG.getConstant(6, MVT::i32));
6523    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1);
6524
6525    SDValue Ops[] =
6526      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
6527    return DAG.getMergeValues(Ops, 2, dl);
6528  }
6529}
6530
6531SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
6532  /*
6533   The rounding mode is in bits 11:10 of FPSR, and has the following
6534   settings:
6535     00 Round to nearest
6536     01 Round to -inf
6537     10 Round to +inf
6538     11 Round to 0
6539
6540  FLT_ROUNDS, on the other hand, expects the following:
6541    -1 Undefined
6542     0 Round to 0
6543     1 Round to nearest
6544     2 Round to +inf
6545     3 Round to -inf
6546
6547  To perform the conversion, we do:
6548    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
6549  */
6550
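  // Checking the mapping: bits 11:10 = 00 gives ((0|0)+1)&3 = 1 (nearest),
  // 01 gives ((0|2)+1)&3 = 3 (-inf), 10 gives ((1|0)+1)&3 = 2 (+inf), and
  // 11 gives ((1|2)+1)&3 = 0 (round to zero).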
6551  MachineFunction &MF = DAG.getMachineFunction();
6552  const TargetMachine &TM = MF.getTarget();
6553  const TargetFrameInfo &TFI = *TM.getFrameInfo();
6554  unsigned StackAlignment = TFI.getStackAlignment();
6555  MVT VT = Op.getValueType();
6556  DebugLoc dl = Op.getDebugLoc();
6557
6558  // Save FP Control Word to stack slot
6559  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment);
6560  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
6561
6562  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
6563                              DAG.getEntryNode(), StackSlot);
6564
6565  // Load FP Control Word from stack slot
6566  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0);
6567
6568  // Transform as necessary
6569  SDValue CWD1 =
6570    DAG.getNode(ISD::SRL, dl, MVT::i16,
6571                DAG.getNode(ISD::AND, dl, MVT::i16,
6572                            CWD, DAG.getConstant(0x800, MVT::i16)),
6573                DAG.getConstant(11, MVT::i8));
6574  SDValue CWD2 =
6575    DAG.getNode(ISD::SRL, dl, MVT::i16,
6576                DAG.getNode(ISD::AND, dl, MVT::i16,
6577                            CWD, DAG.getConstant(0x400, MVT::i16)),
6578                DAG.getConstant(9, MVT::i8));
6579
6580  SDValue RetVal =
6581    DAG.getNode(ISD::AND, dl, MVT::i16,
6582                DAG.getNode(ISD::ADD, dl, MVT::i16,
6583                            DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
6584                            DAG.getConstant(1, MVT::i16)),
6585                DAG.getConstant(3, MVT::i16));
6586
6587
6588  return DAG.getNode((VT.getSizeInBits() < 16 ?
6589                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
6590}
6591
6592SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
6593  MVT VT = Op.getValueType();
6594  MVT OpVT = VT;
6595  unsigned NumBits = VT.getSizeInBits();
6596  DebugLoc dl = Op.getDebugLoc();
6597
6598  Op = Op.getOperand(0);
6599  if (VT == MVT::i8) {
6600    // Zero extend to i32 since there is not an i8 bsr.
6601    OpVT = MVT::i32;
6602    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
6603  }
6604
6605  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
6606  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
6607  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
6608
6609  // If src is zero (i.e. bsr sets ZF), select 2*NumBits-1; the xor below then yields NumBits.
6610  SmallVector<SDValue, 4> Ops;
6611  Ops.push_back(Op);
6612  Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT));
6613  Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
6614  Ops.push_back(Op.getValue(1));
6615  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);
6616
6617  // Finally xor with NumBits-1.
6618  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
6619
6620  if (VT == MVT::i8)
6621    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
6622  return Op;
6623}
6624
6625SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
6626  MVT VT = Op.getValueType();
6627  MVT OpVT = VT;
6628  unsigned NumBits = VT.getSizeInBits();
6629  DebugLoc dl = Op.getDebugLoc();
6630
6631  Op = Op.getOperand(0);
6632  if (VT == MVT::i8) {
6633    OpVT = MVT::i32;
6634    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
6635  }
6636
6637  // Issue a bsf (scan bits forward) which also sets EFLAGS.
6638  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
6639  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
6640
6641  // If src is zero (i.e. bsf sets ZF), returns NumBits.
6642  SmallVector<SDValue, 4> Ops;
6643  Ops.push_back(Op);
6644  Ops.push_back(DAG.getConstant(NumBits, OpVT));
6645  Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
6646  Ops.push_back(Op.getValue(1));
6647  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);
6648
6649  if (VT == MVT::i8)
6650    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
6651  return Op;
6652}
6653
6654SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
6655  MVT VT = Op.getValueType();
6656  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
6657  DebugLoc dl = Op.getDebugLoc();
6658
6659  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
6660  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
6661  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
6662  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
6663  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
6664  //
6665  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
6666  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
6667  //  return AloBlo + AloBhi + AhiBlo;
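  //
  //  In other words this is ordinary long multiplication assembled from
  //  32x32->64 pieces:
  //    a * b = (a_lo + 2^32*a_hi) * (b_lo + 2^32*b_hi)
  //          = a_lo*b_lo + 2^32*(a_lo*b_hi + a_hi*b_lo)      (mod 2^64)
  //  pmuludq supplies the 32x32->64 products; the a_hi*b_hi term is dropped
  //  since it only affects bits above 63.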
6668
6669  SDValue A = Op.getOperand(0);
6670  SDValue B = Op.getOperand(1);
6671
6672  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6673                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
6674                       A, DAG.getConstant(32, MVT::i32));
6675  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6676                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
6677                       B, DAG.getConstant(32, MVT::i32));
6678  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6679                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6680                       A, B);
6681  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6682                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6683                       A, Bhi);
6684  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6685                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6686                       Ahi, B);
6687  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6688                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
6689                       AloBhi, DAG.getConstant(32, MVT::i32));
6690  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6691                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
6692                       AhiBlo, DAG.getConstant(32, MVT::i32));
6693  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
6694  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
6695  return Res;
6696}
6697
6698
6699SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
6700  // Lower the "add/sub/mul with overflow" instruction into a regular operation plus
6701  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
6702  // looks for this combo and may remove the "setcc" instruction if the "setcc"
6703  // has only one use.
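  //
  // For example, (sadd.with.overflow x, y) becomes, roughly:
  //   Sum   = X86ISD::ADD x, y              (also produces EFLAGS)
  //   SetCC = X86ISD::SETCC COND_O, EFLAGS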
6704  SDNode *N = Op.getNode();
6705  SDValue LHS = N->getOperand(0);
6706  SDValue RHS = N->getOperand(1);
6707  unsigned BaseOp = 0;
6708  unsigned Cond = 0;
6709  DebugLoc dl = Op.getDebugLoc();
6710
6711  switch (Op.getOpcode()) {
6712  default: llvm_unreachable("Unknown ovf instruction!");
6713  case ISD::SADDO:
6714    // An add of one will be selected as an INC. Note that INC doesn't
6715    // set CF, so we can't do this for UADDO.
6716    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
6717      if (C->getAPIntValue() == 1) {
6718        BaseOp = X86ISD::INC;
6719        Cond = X86::COND_O;
6720        break;
6721      }
6722    BaseOp = X86ISD::ADD;
6723    Cond = X86::COND_O;
6724    break;
6725  case ISD::UADDO:
6726    BaseOp = X86ISD::ADD;
6727    Cond = X86::COND_B;
6728    break;
6729  case ISD::SSUBO:
6730    // A subtract of one will be selected as a DEC. Note that DEC doesn't
6731    // set CF, so we can't do this for USUBO.
6732    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
6733      if (C->getAPIntValue() == 1) {
6734        BaseOp = X86ISD::DEC;
6735        Cond = X86::COND_O;
6736        break;
6737      }
6738    BaseOp = X86ISD::SUB;
6739    Cond = X86::COND_O;
6740    break;
6741  case ISD::USUBO:
6742    BaseOp = X86ISD::SUB;
6743    Cond = X86::COND_B;
6744    break;
6745  case ISD::SMULO:
6746    BaseOp = X86ISD::SMUL;
6747    Cond = X86::COND_O;
6748    break;
6749  case ISD::UMULO:
6750    BaseOp = X86ISD::UMUL;
6751    Cond = X86::COND_B;
6752    break;
6753  }
6754
6755  // Also sets EFLAGS.
6756  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
6757  SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
6758
6759  SDValue SetCC =
6760    DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
6761                DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
6762
6763  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
6764  return Sum;
6765}
6766
6767SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
6768  MVT T = Op.getValueType();
6769  DebugLoc dl = Op.getDebugLoc();
6770  unsigned Reg = 0;
6771  unsigned size = 0;
6772  switch(T.getSimpleVT()) {
6773  default:
6774    assert(false && "Invalid value type!");
6775  case MVT::i8:  Reg = X86::AL;  size = 1; break;
6776  case MVT::i16: Reg = X86::AX;  size = 2; break;
6777  case MVT::i32: Reg = X86::EAX; size = 4; break;
6778  case MVT::i64:
6779    assert(Subtarget->is64Bit() && "Node not type legal!");
6780    Reg = X86::RAX; size = 8;
6781    break;
6782  }
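  // The expected ("compare") value is placed in the width-appropriate
  // accumulator register; LCMPXCHG_DAG performs the locked compare-exchange
  // and the previous memory value is read back out of that same register.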
6783  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
6784                                    Op.getOperand(2), SDValue());
6785  SDValue Ops[] = { cpIn.getValue(0),
6786                    Op.getOperand(1),
6787                    Op.getOperand(3),
6788                    DAG.getTargetConstant(size, MVT::i8),
6789                    cpIn.getValue(1) };
6790  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6791  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
6792  SDValue cpOut =
6793    DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
6794  return cpOut;
6795}
6796
6797SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
6798                                                 SelectionDAG &DAG) {
6799  assert(Subtarget->is64Bit() && "Result not type legalized?");
6800  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6801  SDValue TheChain = Op.getOperand(0);
6802  DebugLoc dl = Op.getDebugLoc();
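  // rdtsc returns the 64-bit time-stamp counter split across RAX (low 32
  // bits) and RDX (high 32 bits); reassemble it with a shift and an or.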
6803  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
6804  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
6805  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
6806                                   rax.getValue(2));
6807  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
6808                            DAG.getConstant(32, MVT::i8));
6809  SDValue Ops[] = {
6810    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
6811    rdx.getValue(1)
6812  };
6813  return DAG.getMergeValues(Ops, 2, dl);
6814}
6815
6816SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
6817  SDNode *Node = Op.getNode();
6818  DebugLoc dl = Node->getDebugLoc();
6819  MVT T = Node->getValueType(0);
6820  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
6821                              DAG.getConstant(0, T), Node->getOperand(2));
6822  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
6823                       cast<AtomicSDNode>(Node)->getMemoryVT(),
6824                       Node->getOperand(0),
6825                       Node->getOperand(1), negOp,
6826                       cast<AtomicSDNode>(Node)->getSrcValue(),
6827                       cast<AtomicSDNode>(Node)->getAlignment());
6828}
6829
6830/// LowerOperation - Provide custom lowering hooks for some operations.
6831///
6832SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
6833  switch (Op.getOpcode()) {
6834  default: llvm_unreachable("Should not custom lower this!");
6835  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
6836  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
6837  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
6838  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
6839  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6840  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
6841  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
6842  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
6843  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
6844  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
6845  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
6846  case ISD::SHL_PARTS:
6847  case ISD::SRA_PARTS:
6848  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
6849  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
6850  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
6851  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
6852  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
6853  case ISD::FABS:               return LowerFABS(Op, DAG);
6854  case ISD::FNEG:               return LowerFNEG(Op, DAG);
6855  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
6856  case ISD::SETCC:              return LowerSETCC(Op, DAG);
6857  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
6858  case ISD::SELECT:             return LowerSELECT(Op, DAG);
6859  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
6860  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
6861  case ISD::VASTART:            return LowerVASTART(Op, DAG);
6862  case ISD::VAARG:              return LowerVAARG(Op, DAG);
6863  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
6864  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6865  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
6866  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
6867  case ISD::FRAME_TO_ARGS_OFFSET:
6868                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
6869  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
6870  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
6871  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
6872  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
6873  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
6874  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
6875  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
6876  case ISD::SADDO:
6877  case ISD::UADDO:
6878  case ISD::SSUBO:
6879  case ISD::USUBO:
6880  case ISD::SMULO:
6881  case ISD::UMULO:              return LowerXALUO(Op, DAG);
6882  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
6883  }
6884}
6885
6886void X86TargetLowering::
6887ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
6888                        SelectionDAG &DAG, unsigned NewOp) {
6889  MVT T = Node->getValueType(0);
6890  DebugLoc dl = Node->getDebugLoc();
6891  assert (T == MVT::i64 && "Only know how to expand i64 atomics");
6892
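  // Split the i64 value operand into two i32 halves, emit the matching
  // 64-bit atomic pseudo (one of the *64_DAG nodes) on those halves, and
  // glue the two i32 results back together with the BUILD_PAIR below.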
6893  SDValue Chain = Node->getOperand(0);
6894  SDValue In1 = Node->getOperand(1);
6895  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
6896                             Node->getOperand(2), DAG.getIntPtrConstant(0));
6897  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
6898                             Node->getOperand(2), DAG.getIntPtrConstant(1));
6899  // This is a generalized SDNode, not an AtomicSDNode, so it doesn't
6900  // have a MemOperand.  Pass the info through as a normal operand.
6901  SDValue LSI = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand());
6902  SDValue Ops[] = { Chain, In1, In2L, In2H, LSI };
6903  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
6904  SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5);
6905  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
6906  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
6907  Results.push_back(Result.getValue(2));
6908}
6909
6910/// ReplaceNodeResults - Replace a node with an illegal result type
6911/// with a new node built out of custom code.
6912void X86TargetLowering::ReplaceNodeResults(SDNode *N,
6913                                           SmallVectorImpl<SDValue>&Results,
6914                                           SelectionDAG &DAG) {
6915  DebugLoc dl = N->getDebugLoc();
6916  switch (N->getOpcode()) {
6917  default:
6918    assert(false && "Do not know how to custom type legalize this operation!");
6919    return;
6920  case ISD::FP_TO_SINT: {
6921    std::pair<SDValue,SDValue> Vals =
6922        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
6923    SDValue FIST = Vals.first, StackSlot = Vals.second;
6924    if (FIST.getNode() != 0) {
6925      MVT VT = N->getValueType(0);
6926      // Return a load from the stack slot.
6927      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0));
6928    }
6929    return;
6930  }
6931  case ISD::READCYCLECOUNTER: {
6932    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6933    SDValue TheChain = N->getOperand(0);
6934    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
6935    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
6936                                     rd.getValue(1));
6937    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
6938                                     eax.getValue(2));
6939    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
6940    SDValue Ops[] = { eax, edx };
6941    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
6942    Results.push_back(edx.getValue(1));
6943    return;
6944  }
6945  case ISD::ATOMIC_CMP_SWAP: {
6946    MVT T = N->getValueType(0);
6947    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
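    // Split the 64-bit compare value into EDX:EAX and the 64-bit swap value
    // into ECX:EBX, issue LCMPXCHG8B, then reassemble the old value that
    // comes back in EDX:EAX with a BUILD_PAIR.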
6948    SDValue cpInL, cpInH;
6949    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
6950                        DAG.getConstant(0, MVT::i32));
6951    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
6952                        DAG.getConstant(1, MVT::i32));
6953    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
6954    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
6955                             cpInL.getValue(1));
6956    SDValue swapInL, swapInH;
6957    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
6958                          DAG.getConstant(0, MVT::i32));
6959    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
6960                          DAG.getConstant(1, MVT::i32));
6961    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
6962                               cpInH.getValue(1));
6963    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
6964                               swapInL.getValue(1));
6965    SDValue Ops[] = { swapInH.getValue(0),
6966                      N->getOperand(1),
6967                      swapInH.getValue(1) };
6968    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6969    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
6970    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
6971                                        MVT::i32, Result.getValue(1));
6972    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
6973                                        MVT::i32, cpOutL.getValue(2));
6974    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
6975    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
6976    Results.push_back(cpOutH.getValue(1));
6977    return;
6978  }
6979  case ISD::ATOMIC_LOAD_ADD:
6980    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
6981    return;
6982  case ISD::ATOMIC_LOAD_AND:
6983    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
6984    return;
6985  case ISD::ATOMIC_LOAD_NAND:
6986    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
6987    return;
6988  case ISD::ATOMIC_LOAD_OR:
6989    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
6990    return;
6991  case ISD::ATOMIC_LOAD_SUB:
6992    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
6993    return;
6994  case ISD::ATOMIC_LOAD_XOR:
6995    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
6996    return;
6997  case ISD::ATOMIC_SWAP:
6998    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
6999    return;
7000  }
7001}
7002
7003const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
7004  switch (Opcode) {
7005  default: return NULL;
7006  case X86ISD::BSF:                return "X86ISD::BSF";
7007  case X86ISD::BSR:                return "X86ISD::BSR";
7008  case X86ISD::SHLD:               return "X86ISD::SHLD";
7009  case X86ISD::SHRD:               return "X86ISD::SHRD";
7010  case X86ISD::FAND:               return "X86ISD::FAND";
7011  case X86ISD::FOR:                return "X86ISD::FOR";
7012  case X86ISD::FXOR:               return "X86ISD::FXOR";
7013  case X86ISD::FSRL:               return "X86ISD::FSRL";
7014  case X86ISD::FILD:               return "X86ISD::FILD";
7015  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
7016  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
7017  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
7018  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
7019  case X86ISD::FLD:                return "X86ISD::FLD";
7020  case X86ISD::FST:                return "X86ISD::FST";
7021  case X86ISD::CALL:               return "X86ISD::CALL";
7022  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
7023  case X86ISD::BT:                 return "X86ISD::BT";
7024  case X86ISD::CMP:                return "X86ISD::CMP";
7025  case X86ISD::COMI:               return "X86ISD::COMI";
7026  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
7027  case X86ISD::SETCC:              return "X86ISD::SETCC";
7028  case X86ISD::CMOV:               return "X86ISD::CMOV";
7029  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
7030  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
7031  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
7032  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
7033  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
7034  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
7035  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
7036  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
7037  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
7038  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
7039  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
7040  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
7041  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
7042  case X86ISD::FMAX:               return "X86ISD::FMAX";
7043  case X86ISD::FMIN:               return "X86ISD::FMIN";
7044  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
7045  case X86ISD::FRCP:               return "X86ISD::FRCP";
7046  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
7047  case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
7048  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
7049  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
7050  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
7051  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
7052  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
7053  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
7054  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
7055  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
7056  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
7057  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
7058  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
7059  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
7060  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
7061  case X86ISD::VSHL:               return "X86ISD::VSHL";
7062  case X86ISD::VSRL:               return "X86ISD::VSRL";
7063  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
7064  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
7065  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
7066  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
7067  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
7068  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
7069  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
7070  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
7071  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
7072  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
7073  case X86ISD::ADD:                return "X86ISD::ADD";
7074  case X86ISD::SUB:                return "X86ISD::SUB";
7075  case X86ISD::SMUL:               return "X86ISD::SMUL";
7076  case X86ISD::UMUL:               return "X86ISD::UMUL";
7077  case X86ISD::INC:                return "X86ISD::INC";
7078  case X86ISD::DEC:                return "X86ISD::DEC";
7079  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
7080  case X86ISD::PTEST:              return "X86ISD::PTEST";
7081  }
7082}
7083
7084// isLegalAddressingMode - Return true if the addressing mode represented
7085// by AM is legal for this target, for a load/store of the specified type.
7086bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
7087                                              const Type *Ty) const {
7088  // X86 supports extremely general addressing modes.
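  // The general form is [BaseReg + Scale*IndexReg + Disp32], optionally
  // relative to a global (BaseGV); the checks below only reject the few
  // combinations that cannot be encoded for the current code model.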
7089  CodeModel::Model M = getTargetMachine().getCodeModel();
7090
7091  // X86 allows a sign-extended 32-bit immediate field as a displacement.
7092  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
7093    return false;
7094
7095  if (AM.BaseGV) {
7096    unsigned GVFlags =
7097      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
7098
7099    // If a reference to this global requires an extra load, we can't fold it.
7100    if (isGlobalStubReference(GVFlags))
7101      return false;
7102
7103    // If BaseGV requires a register for the PIC base, we cannot also have a
7104    // BaseReg specified.
7105    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
7106      return false;
7107
7108    // If lower 4G is not available, then we must use rip-relative addressing.
7109    if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
7110      return false;
7111  }
7112
7113  switch (AM.Scale) {
7114  case 0:
7115  case 1:
7116  case 2:
7117  case 4:
7118  case 8:
7119    // These scales always work.
7120    break;
7121  case 3:
7122  case 5:
7123  case 9:
7124    // These scales are formed with basereg+scalereg.  Only accept if there is
7125    // no basereg yet.
7126    if (AM.HasBaseReg)
7127      return false;
7128    break;
7129  default:  // Other stuff never works.
7130    return false;
7131  }
7132
7133  return true;
7134}
7135
7136
7137bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
7138  if (!Ty1->isInteger() || !Ty2->isInteger())
7139    return false;
7140  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
7141  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
7142  if (NumBits1 <= NumBits2)
7143    return false;
7144  return Subtarget->is64Bit() || NumBits1 < 64;
7145}
7146
7147bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const {
7148  if (!VT1.isInteger() || !VT2.isInteger())
7149    return false;
7150  unsigned NumBits1 = VT1.getSizeInBits();
7151  unsigned NumBits2 = VT2.getSizeInBits();
7152  if (NumBits1 <= NumBits2)
7153    return false;
7154  return Subtarget->is64Bit() || NumBits1 < 64;
7155}
7156
7157bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
7158  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7159  return Ty1 == Type::Int32Ty && Ty2 == Type::Int64Ty && Subtarget->is64Bit();
7160}
7161
7162bool X86TargetLowering::isZExtFree(MVT VT1, MVT VT2) const {
7163  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7164  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
7165}
7166
7167bool X86TargetLowering::isNarrowingProfitable(MVT VT1, MVT VT2) const {
7168  // i16 instructions are longer (0x66 prefix) and potentially slower.
7169  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
7170}
7171
7172/// isShuffleMaskLegal - Targets can use this to indicate that they only
7173/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
7174/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
7175/// are assumed to be legal.
7176bool
7177X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
7178                                      MVT VT) const {
7179  // Only do shuffles on 128-bit vector types for now.
7180  if (VT.getSizeInBits() == 64)
7181    return false;
7182
7183  // FIXME: pshufb, blends, palignr, shifts.
7184  return (VT.getVectorNumElements() == 2 ||
7185          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7186          isMOVLMask(M, VT) ||
7187          isSHUFPMask(M, VT) ||
7188          isPSHUFDMask(M, VT) ||
7189          isPSHUFHWMask(M, VT) ||
7190          isPSHUFLWMask(M, VT) ||
7191          isUNPCKLMask(M, VT) ||
7192          isUNPCKHMask(M, VT) ||
7193          isUNPCKL_v_undef_Mask(M, VT) ||
7194          isUNPCKH_v_undef_Mask(M, VT));
7195}
7196
7197bool
7198X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
7199                                          MVT VT) const {
7200  unsigned NumElts = VT.getVectorNumElements();
7201  // FIXME: This collection of masks seems suspect.
7202  if (NumElts == 2)
7203    return true;
7204  if (NumElts == 4 && VT.getSizeInBits() == 128) {
7205    return (isMOVLMask(Mask, VT)  ||
7206            isCommutedMOVLMask(Mask, VT, true) ||
7207            isSHUFPMask(Mask, VT) ||
7208            isCommutedSHUFPMask(Mask, VT));
7209  }
7210  return false;
7211}
7212
7213//===----------------------------------------------------------------------===//
7214//                           X86 Scheduler Hooks
7215//===----------------------------------------------------------------------===//
7216
7217// private utility function
7218MachineBasicBlock *
7219X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
7220                                                       MachineBasicBlock *MBB,
7221                                                       unsigned regOpc,
7222                                                       unsigned immOpc,
7223                                                       unsigned LoadOpc,
7224                                                       unsigned CXchgOpc,
7225                                                       unsigned copyOpc,
7226                                                       unsigned notOpc,
7227                                                       unsigned EAXreg,
7228                                                       TargetRegisterClass *RC,
7229                                                       bool invSrc) const {
7230  // For the atomic bitwise operator, we generate
7231  //   thisMBB:
7232  //   newMBB:
7233  //     ld  t1 = [bitinstr.addr]
7234  //     op  t2 = t1, [bitinstr.val]
7235  //     mov EAX = t1
7236  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
7237  //     bz  newMBB
7238  //     fallthrough -->nextMBB
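  //
  // The loop is required because lcs (lock cmpxchg) only succeeds when
  // [bitinstr.addr] still contains t1; if another thread changed it in the
  // meantime, ZF is clear and the JNE retries with a freshly loaded value.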
7239  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7240  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7241  MachineFunction::iterator MBBIter = MBB;
7242  ++MBBIter;
7243
7244  /// First build the CFG
7245  MachineFunction *F = MBB->getParent();
7246  MachineBasicBlock *thisMBB = MBB;
7247  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7248  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7249  F->insert(MBBIter, newMBB);
7250  F->insert(MBBIter, nextMBB);
7251
7252  // Move all successors of thisMBB to nextMBB
7253  nextMBB->transferSuccessors(thisMBB);
7254
7255  // Update thisMBB to fall through to newMBB
7256  thisMBB->addSuccessor(newMBB);
7257
7258  // newMBB jumps to itself and falls through to nextMBB
7259  newMBB->addSuccessor(nextMBB);
7260  newMBB->addSuccessor(newMBB);
7261
7262  // Insert instructions into newMBB based on incoming instruction
7263  assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7264         "unexpected number of operands");
7265  DebugLoc dl = bInstr->getDebugLoc();
7266  MachineOperand& destOper = bInstr->getOperand(0);
7267  MachineOperand* argOpers[2 + X86AddrNumOperands];
7268  int numArgs = bInstr->getNumOperands() - 1;
7269  for (int i=0; i < numArgs; ++i)
7270    argOpers[i] = &bInstr->getOperand(i+1);
7271
7272  // x86 address has 5 operands: base, scale, index, displacement, and segment
7273  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
7274  int valArgIndx = lastAddrIndx + 1;
7275
7276  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7277  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
7278  for (int i=0; i <= lastAddrIndx; ++i)
7279    (*MIB).addOperand(*argOpers[i]);
7280
7281  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
7282  if (invSrc) {
7283    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
7284  } else {
7285    tt = t1;
7286  }
7287
7288  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7289  assert((argOpers[valArgIndx]->isReg() ||
7290          argOpers[valArgIndx]->isImm()) &&
7291         "invalid operand");
7292  if (argOpers[valArgIndx]->isReg())
7293    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
7294  else
7295    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
7296  MIB.addReg(tt);
7297  (*MIB).addOperand(*argOpers[valArgIndx]);
7298
7299  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
7300  MIB.addReg(t1);
7301
7302  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
7303  for (int i=0; i <= lastAddrIndx; ++i)
7304    (*MIB).addOperand(*argOpers[i]);
7305  MIB.addReg(t2);
7306  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7307  (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
7308
7309  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
7310  MIB.addReg(EAXreg);
7311
7312  // insert branch
7313  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7314
7315  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
7316  return nextMBB;
7317}
7318
7319  // private utility function: 64-bit atomics on a 32-bit host.
7320MachineBasicBlock *
7321X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
7322                                                       MachineBasicBlock *MBB,
7323                                                       unsigned regOpcL,
7324                                                       unsigned regOpcH,
7325                                                       unsigned immOpcL,
7326                                                       unsigned immOpcH,
7327                                                       bool invSrc) const {
7328  // For the atomic bitwise operator, we generate
7329  //   thisMBB (instructions are in pairs, except cmpxchg8b)
7330  //     ld t1,t2 = [bitinstr.addr]
7331  //   newMBB:
7332  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
7333  //     op  t5, t6 <- out1, out2, [bitinstr.val]
7334  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
7335  //     mov ECX, EBX <- t5, t6
7336  //     mov EAX, EDX <- t1, t2
7337  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
7338  //     mov t3, t4 <- EAX, EDX
7339  //     bz  newMBB
7340  //     result in out1, out2
7341  //     fallthrough -->nextMBB
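  //
  // cmpxchg8b compares EDX:EAX with the 8-byte memory operand; on a match it
  // stores ECX:EBX and sets ZF, otherwise it loads the current memory value
  // into EDX:EAX and clears ZF, so the JNE below simply retries the update.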
7342
7343  const TargetRegisterClass *RC = X86::GR32RegisterClass;
7344  const unsigned LoadOpc = X86::MOV32rm;
7345  const unsigned copyOpc = X86::MOV32rr;
7346  const unsigned NotOpc = X86::NOT32r;
7347  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7348  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7349  MachineFunction::iterator MBBIter = MBB;
7350  ++MBBIter;
7351
7352  /// First build the CFG
7353  MachineFunction *F = MBB->getParent();
7354  MachineBasicBlock *thisMBB = MBB;
7355  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7356  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7357  F->insert(MBBIter, newMBB);
7358  F->insert(MBBIter, nextMBB);
7359
7360  // Move all successors of thisMBB to nextMBB
7361  nextMBB->transferSuccessors(thisMBB);
7362
7363  // Update thisMBB to fall through to newMBB
7364  thisMBB->addSuccessor(newMBB);
7365
7366  // newMBB jumps to itself and falls through to nextMBB
7367  newMBB->addSuccessor(nextMBB);
7368  newMBB->addSuccessor(newMBB);
7369
7370  DebugLoc dl = bInstr->getDebugLoc();
7371  // Insert instructions into newMBB based on incoming instruction
7372  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
7373  assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
7374         "unexpected number of operands");
7375  MachineOperand& dest1Oper = bInstr->getOperand(0);
7376  MachineOperand& dest2Oper = bInstr->getOperand(1);
7377  MachineOperand* argOpers[2 + X86AddrNumOperands];
7378  for (int i=0; i < 2 + X86AddrNumOperands; ++i)
7379    argOpers[i] = &bInstr->getOperand(i+2);
7380
7381  // x86 address has 5 operands: base, scale, index, displacement, and segment
7382  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
7383
7384  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7385  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
7386  for (int i=0; i <= lastAddrIndx; ++i)
7387    (*MIB).addOperand(*argOpers[i]);
7388  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7389  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
7390  // add 4 to displacement.
7391  for (int i=0; i <= lastAddrIndx-2; ++i)
7392    (*MIB).addOperand(*argOpers[i]);
7393  MachineOperand newOp3 = *(argOpers[3]);
7394  if (newOp3.isImm())
7395    newOp3.setImm(newOp3.getImm()+4);
7396  else
7397    newOp3.setOffset(newOp3.getOffset()+4);
7398  (*MIB).addOperand(newOp3);
7399  (*MIB).addOperand(*argOpers[lastAddrIndx]);
7400
7401  // t3/4 are defined later, at the bottom of the loop
7402  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
7403  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
7404  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
7405    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
7406  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
7407    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
7408
7409  unsigned tt1 = F->getRegInfo().createVirtualRegister(RC);
7410  unsigned tt2 = F->getRegInfo().createVirtualRegister(RC);
7411  if (invSrc) {
7412    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1);
7413    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2);
7414  } else {
7415    tt1 = t1;
7416    tt2 = t2;
7417  }
7418
7419  int valArgIndx = lastAddrIndx + 1;
7420  assert((argOpers[valArgIndx]->isReg() ||
7421          argOpers[valArgIndx]->isImm()) &&
7422         "invalid operand");
7423  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
7424  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
7425  if (argOpers[valArgIndx]->isReg())
7426    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
7427  else
7428    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
7429  if (regOpcL != X86::MOV32rr)
7430    MIB.addReg(tt1);
7431  (*MIB).addOperand(*argOpers[valArgIndx]);
7432  assert(argOpers[valArgIndx + 1]->isReg() ==
7433         argOpers[valArgIndx]->isReg());
7434  assert(argOpers[valArgIndx + 1]->isImm() ==
7435         argOpers[valArgIndx]->isImm());
7436  if (argOpers[valArgIndx + 1]->isReg())
7437    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
7438  else
7439    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
7440  if (regOpcH != X86::MOV32rr)
7441    MIB.addReg(tt2);
7442  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
7443
7444  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
7445  MIB.addReg(t1);
7446  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
7447  MIB.addReg(t2);
7448
7449  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
7450  MIB.addReg(t5);
7451  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
7452  MIB.addReg(t6);
7453
7454  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
7455  for (int i=0; i <= lastAddrIndx; ++i)
7456    (*MIB).addOperand(*argOpers[i]);
7457
7458  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7459  (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
7460
7461  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
7462  MIB.addReg(X86::EAX);
7463  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
7464  MIB.addReg(X86::EDX);
7465
7466  // insert branch
7467  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7468
7469  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
7470  return nextMBB;
7471}
7472
7473// private utility function
7474MachineBasicBlock *
7475X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
7476                                                      MachineBasicBlock *MBB,
7477                                                      unsigned cmovOpc) const {
7478  // For the atomic min/max operator, we generate
7479  //   thisMBB:
7480  //   newMBB:
7481  //     ld t1 = [min/max.addr]
7482  //     mov t2 = [min/max.val]
7483  //     cmp  t1, t2
7484  //     cmov[cond] t2 = t1
7485  //     mov EAX = t1
7486  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
7487  //     bz   newMBB
7488  //     fallthrough -->nextMBB
7489  //
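  // The cmov picks whichever of the loaded value (t1) and the incoming
  // operand (t2) satisfies the requested ordering; lcs publishes that value
  // only if the memory location is still unchanged, otherwise we loop.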
7490  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7491  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7492  MachineFunction::iterator MBBIter = MBB;
7493  ++MBBIter;
7494
7495  /// First build the CFG
7496  MachineFunction *F = MBB->getParent();
7497  MachineBasicBlock *thisMBB = MBB;
7498  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7499  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7500  F->insert(MBBIter, newMBB);
7501  F->insert(MBBIter, nextMBB);
7502
7503  // Move all successors of thisMBB to nextMBB
7504  nextMBB->transferSuccessors(thisMBB);
7505
7506  // Update thisMBB to fall through to newMBB
7507  thisMBB->addSuccessor(newMBB);
7508
7509  // newMBB jumps to itself and falls through to nextMBB
7510  newMBB->addSuccessor(nextMBB);
7511  newMBB->addSuccessor(newMBB);
7512
7513  DebugLoc dl = mInstr->getDebugLoc();
7514  // Insert instructions into newMBB based on incoming instruction
7515  assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7516         "unexpected number of operands");
7517  MachineOperand& destOper = mInstr->getOperand(0);
7518  MachineOperand* argOpers[2 + X86AddrNumOperands];
7519  int numArgs = mInstr->getNumOperands() - 1;
7520  for (int i=0; i < numArgs; ++i)
7521    argOpers[i] = &mInstr->getOperand(i+1);
7522
7523  // x86 address has 5 operands: base, scale, index, displacement, and segment
7524  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
7525  int valArgIndx = lastAddrIndx + 1;
7526
7527  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7528  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
7529  for (int i=0; i <= lastAddrIndx; ++i)
7530    (*MIB).addOperand(*argOpers[i]);
7531
7532  // We only support register and immediate values
7533  assert((argOpers[valArgIndx]->isReg() ||
7534          argOpers[valArgIndx]->isImm()) &&
7535         "invalid operand");
7536
7537  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7538  if (argOpers[valArgIndx]->isReg())
7539    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
7540  else
7541    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
7542  (*MIB).addOperand(*argOpers[valArgIndx]);
7543
7544  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
7545  MIB.addReg(t1);
7546
7547  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
7548  MIB.addReg(t1);
7549  MIB.addReg(t2);
7550
7551  // Generate the cmov.
7552  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7553  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
7554  MIB.addReg(t2);
7555  MIB.addReg(t1);
7556
7557  // Compare and exchange if no other thread has modified the memory location
7558  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
7559  for (int i=0; i <= lastAddrIndx; ++i)
7560    (*MIB).addOperand(*argOpers[i]);
7561  MIB.addReg(t3);
7562  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7563  (*MIB).addMemOperand(*F, *mInstr->memoperands_begin());
7564
7565  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
7566  MIB.addReg(X86::EAX);
7567
7568  // insert branch
7569  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7570
7571  F->DeleteMachineInstr(mInstr);   // The pseudo instruction is gone now.
7572  return nextMBB;
7573}
7574
7575
7576MachineBasicBlock *
7577X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
7578                                               MachineBasicBlock *BB) const {
7579  DebugLoc dl = MI->getDebugLoc();
7580  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7581  switch (MI->getOpcode()) {
7582  default: assert(false && "Unexpected instr type to insert");
7583  case X86::CMOV_V1I64:
7584  case X86::CMOV_FR32:
7585  case X86::CMOV_FR64:
7586  case X86::CMOV_V4F32:
7587  case X86::CMOV_V2F64:
7588  case X86::CMOV_V2I64: {
7589    // To "insert" a SELECT_CC instruction, we actually have to insert the
7590    // diamond control-flow pattern.  The incoming instruction knows the
7591    // destination vreg to set, the condition code register to branch on, the
7592    // true/false values to select between, and a branch opcode to use.
7593    const BasicBlock *LLVM_BB = BB->getBasicBlock();
7594    MachineFunction::iterator It = BB;
7595    ++It;
7596
7597    //  thisMBB:
7598    //  ...
7599    //   TrueVal = ...
7600    //   cmpTY ccX, r1, r2
7601    //   bCC sinkMBB
7602    //   fallthrough --> copy0MBB
7603    MachineBasicBlock *thisMBB = BB;
7604    MachineFunction *F = BB->getParent();
7605    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
7606    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
7607    unsigned Opc =
7608      X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
7609    BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB);
7610    F->insert(It, copy0MBB);
7611    F->insert(It, sinkMBB);
7612    // Update machine-CFG edges by transferring all successors of the current
7613    // block to the new block which will contain the Phi node for the select.
7614    sinkMBB->transferSuccessors(BB);
7615
7616    // Add the true and fallthrough blocks as its successors.
7617    BB->addSuccessor(copy0MBB);
7618    BB->addSuccessor(sinkMBB);
7619
7620    //  copy0MBB:
7621    //   %FalseValue = ...
7622    //   # fallthrough to sinkMBB
7623    BB = copy0MBB;
7624
7625    // Update machine-CFG edges
7626    BB->addSuccessor(sinkMBB);
7627
7628    //  sinkMBB:
7629    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
7630    //  ...
7631    BB = sinkMBB;
7632    BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg())
7633      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
7634      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
7635
7636    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
7637    return BB;
7638  }
7639
7640  case X86::FP32_TO_INT16_IN_MEM:
7641  case X86::FP32_TO_INT32_IN_MEM:
7642  case X86::FP32_TO_INT64_IN_MEM:
7643  case X86::FP64_TO_INT16_IN_MEM:
7644  case X86::FP64_TO_INT32_IN_MEM:
7645  case X86::FP64_TO_INT64_IN_MEM:
7646  case X86::FP80_TO_INT16_IN_MEM:
7647  case X86::FP80_TO_INT32_IN_MEM:
7648  case X86::FP80_TO_INT64_IN_MEM: {
7649    // Change the floating point control register to use "round towards zero"
7650    // mode when truncating to an integer value.
7651    MachineFunction *F = BB->getParent();
7652    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
7653    addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx);
7654
7655    // Load the old value of the control word...
7656    unsigned OldCW =
7657      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
7658    addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW),
7659                      CWFrameIdx);
7660
7661    // Store a control word that selects round-toward-zero (RC bits 11:10 = 11)...
7662    addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx)
7663      .addImm(0xC7F);
7664
7665    // Reload the modified control word now...
7666    addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
7667
7668    // Restore the memory image of the control word to its original value.
7669    addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx)
7670      .addReg(OldCW);
7671
7672    // Get the X86 opcode to use.
7673    unsigned Opc;
7674    switch (MI->getOpcode()) {
7675    default: llvm_unreachable("illegal opcode!");
7676    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
7677    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
7678    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
7679    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
7680    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
7681    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
7682    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
7683    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
7684    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
7685    }
7686
7687    X86AddressMode AM;
7688    MachineOperand &Op = MI->getOperand(0);
7689    if (Op.isReg()) {
7690      AM.BaseType = X86AddressMode::RegBase;
7691      AM.Base.Reg = Op.getReg();
7692    } else {
7693      AM.BaseType = X86AddressMode::FrameIndexBase;
7694      AM.Base.FrameIndex = Op.getIndex();
7695    }
7696    Op = MI->getOperand(1);
7697    if (Op.isImm())
7698      AM.Scale = Op.getImm();
7699    Op = MI->getOperand(2);
7700    if (Op.isImm())
7701      AM.IndexReg = Op.getImm();
7702    Op = MI->getOperand(3);
7703    if (Op.isGlobal()) {
7704      AM.GV = Op.getGlobal();
7705    } else {
7706      AM.Disp = Op.getImm();
7707    }
7708    addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM)
7709                      .addReg(MI->getOperand(X86AddrNumOperands).getReg());
7710
7711    // Reload the original control word now.
7712    addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
7713
7714    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
7715    return BB;
7716  }
7717  case X86::ATOMAND32:
7718    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
7719                                               X86::AND32ri, X86::MOV32rm,
7720                                               X86::LCMPXCHG32, X86::MOV32rr,
7721                                               X86::NOT32r, X86::EAX,
7722                                               X86::GR32RegisterClass);
7723  case X86::ATOMOR32:
7724    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
7725                                               X86::OR32ri, X86::MOV32rm,
7726                                               X86::LCMPXCHG32, X86::MOV32rr,
7727                                               X86::NOT32r, X86::EAX,
7728                                               X86::GR32RegisterClass);
7729  case X86::ATOMXOR32:
7730    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
7731                                               X86::XOR32ri, X86::MOV32rm,
7732                                               X86::LCMPXCHG32, X86::MOV32rr,
7733                                               X86::NOT32r, X86::EAX,
7734                                               X86::GR32RegisterClass);
7735  case X86::ATOMNAND32:
7736    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
7737                                               X86::AND32ri, X86::MOV32rm,
7738                                               X86::LCMPXCHG32, X86::MOV32rr,
7739                                               X86::NOT32r, X86::EAX,
7740                                               X86::GR32RegisterClass, true);
7741  case X86::ATOMMIN32:
7742    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
7743  case X86::ATOMMAX32:
7744    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
7745  case X86::ATOMUMIN32:
7746    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
7747  case X86::ATOMUMAX32:
7748    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
7749
7750  case X86::ATOMAND16:
7751    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
7752                                               X86::AND16ri, X86::MOV16rm,
7753                                               X86::LCMPXCHG16, X86::MOV16rr,
7754                                               X86::NOT16r, X86::AX,
7755                                               X86::GR16RegisterClass);
7756  case X86::ATOMOR16:
7757    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
7758                                               X86::OR16ri, X86::MOV16rm,
7759                                               X86::LCMPXCHG16, X86::MOV16rr,
7760                                               X86::NOT16r, X86::AX,
7761                                               X86::GR16RegisterClass);
7762  case X86::ATOMXOR16:
7763    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
7764                                               X86::XOR16ri, X86::MOV16rm,
7765                                               X86::LCMPXCHG16, X86::MOV16rr,
7766                                               X86::NOT16r, X86::AX,
7767                                               X86::GR16RegisterClass);
7768  case X86::ATOMNAND16:
7769    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
7770                                               X86::AND16ri, X86::MOV16rm,
7771                                               X86::LCMPXCHG16, X86::MOV16rr,
7772                                               X86::NOT16r, X86::AX,
7773                                               X86::GR16RegisterClass, true);
7774  case X86::ATOMMIN16:
7775    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
7776  case X86::ATOMMAX16:
7777    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
7778  case X86::ATOMUMIN16:
7779    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
7780  case X86::ATOMUMAX16:
7781    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
7782
7783  case X86::ATOMAND8:
7784    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
7785                                               X86::AND8ri, X86::MOV8rm,
7786                                               X86::LCMPXCHG8, X86::MOV8rr,
7787                                               X86::NOT8r, X86::AL,
7788                                               X86::GR8RegisterClass);
7789  case X86::ATOMOR8:
7790    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
7791                                               X86::OR8ri, X86::MOV8rm,
7792                                               X86::LCMPXCHG8, X86::MOV8rr,
7793                                               X86::NOT8r, X86::AL,
7794                                               X86::GR8RegisterClass);
7795  case X86::ATOMXOR8:
7796    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
7797                                               X86::XOR8ri, X86::MOV8rm,
7798                                               X86::LCMPXCHG8, X86::MOV8rr,
7799                                               X86::NOT8r, X86::AL,
7800                                               X86::GR8RegisterClass);
7801  case X86::ATOMNAND8:
7802    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
7803                                               X86::AND8ri, X86::MOV8rm,
7804                                               X86::LCMPXCHG8, X86::MOV8rr,
7805                                               X86::NOT8r, X86::AL,
7806                                               X86::GR8RegisterClass, true);
7807  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
7808  // This group is for 64-bit host.
7809  case X86::ATOMAND64:
7810    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
7811                                               X86::AND64ri32, X86::MOV64rm,
7812                                               X86::LCMPXCHG64, X86::MOV64rr,
7813                                               X86::NOT64r, X86::RAX,
7814                                               X86::GR64RegisterClass);
7815  case X86::ATOMOR64:
7816    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
7817                                               X86::OR64ri32, X86::MOV64rm,
7818                                               X86::LCMPXCHG64, X86::MOV64rr,
7819                                               X86::NOT64r, X86::RAX,
7820                                               X86::GR64RegisterClass);
7821  case X86::ATOMXOR64:
7822    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
7823                                               X86::XOR64ri32, X86::MOV64rm,
7824                                               X86::LCMPXCHG64, X86::MOV64rr,
7825                                               X86::NOT64r, X86::RAX,
7826                                               X86::GR64RegisterClass);
7827  case X86::ATOMNAND64:
7828    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
7829                                               X86::AND64ri32, X86::MOV64rm,
7830                                               X86::LCMPXCHG64, X86::MOV64rr,
7831                                               X86::NOT64r, X86::RAX,
7832                                               X86::GR64RegisterClass, true);
7833  case X86::ATOMMIN64:
7834    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
7835  case X86::ATOMMAX64:
7836    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
7837  case X86::ATOMUMIN64:
7838    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
7839  case X86::ATOMUMAX64:
7840    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
7841
7842  // This group does 64-bit operations on a 32-bit host.
7843  case X86::ATOMAND6432:
7844    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7845                                               X86::AND32rr, X86::AND32rr,
7846                                               X86::AND32ri, X86::AND32ri,
7847                                               false);
7848  case X86::ATOMOR6432:
7849    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7850                                               X86::OR32rr, X86::OR32rr,
7851                                               X86::OR32ri, X86::OR32ri,
7852                                               false);
7853  case X86::ATOMXOR6432:
7854    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7855                                               X86::XOR32rr, X86::XOR32rr,
7856                                               X86::XOR32ri, X86::XOR32ri,
7857                                               false);
7858  case X86::ATOMNAND6432:
7859    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7860                                               X86::AND32rr, X86::AND32rr,
7861                                               X86::AND32ri, X86::AND32ri,
7862                                               true);
7863  case X86::ATOMADD6432:
7864    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7865                                               X86::ADD32rr, X86::ADC32rr,
7866                                               X86::ADD32ri, X86::ADC32ri,
7867                                               false);
7868  case X86::ATOMSUB6432:
7869    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7870                                               X86::SUB32rr, X86::SBB32rr,
7871                                               X86::SUB32ri, X86::SBB32ri,
7872                                               false);
7873  case X86::ATOMSWAP6432:
7874    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7875                                               X86::MOV32rr, X86::MOV32rr,
7876                                               X86::MOV32ri, X86::MOV32ri,
7877                                               false);
7878  }
7879}
7880
7881//===----------------------------------------------------------------------===//
7882//                           X86 Optimization Hooks
7883//===----------------------------------------------------------------------===//
7884
7885void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
7886                                                       const APInt &Mask,
7887                                                       APInt &KnownZero,
7888                                                       APInt &KnownOne,
7889                                                       const SelectionDAG &DAG,
7890                                                       unsigned Depth) const {
7891  unsigned Opc = Op.getOpcode();
7892  assert((Opc >= ISD::BUILTIN_OP_END ||
7893          Opc == ISD::INTRINSIC_WO_CHAIN ||
7894          Opc == ISD::INTRINSIC_W_CHAIN ||
7895          Opc == ISD::INTRINSIC_VOID) &&
7896         "Should use MaskedValueIsZero if you don't know whether Op"
7897         " is a target node!");
7898
7899  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
7900  switch (Opc) {
7901  default: break;
7902  case X86ISD::ADD:
7903  case X86ISD::SUB:
7904  case X86ISD::SMUL:
7905  case X86ISD::UMUL:
7906  case X86ISD::INC:
7907  case X86ISD::DEC:
7908    // These nodes' second result is a boolean.
7909    if (Op.getResNo() == 0)
7910      break;
7911    // Fallthrough
7912  case X86ISD::SETCC:
7913    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
7914                                       Mask.getBitWidth() - 1);
7915    break;
7916  }
7917}
7918
7919/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
7920/// node is a GlobalAddress + offset.
7921bool X86TargetLowering::isGAPlusOffset(SDNode *N,
7922                                       GlobalValue* &GA, int64_t &Offset) const{
7923  if (N->getOpcode() == X86ISD::Wrapper) {
7924    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
7925      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
7926      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
7927      return true;
7928    }
7929  }
7930  return TargetLowering::isGAPlusOffset(N, GA, Offset);
7931}
7932
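/// isBaseAlignmentOfN - Return true if Base resolves to a GlobalValue whose
/// alignment is at least N and whose offset from it is a multiple of N, i.e.
/// the resulting address is known to be N-byte aligned.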
7933static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
7934                               const TargetLowering &TLI) {
7935  GlobalValue *GV;
7936  int64_t Offset = 0;
7937  if (TLI.isGAPlusOffset(Base, GV, Offset))
7938    return (GV->getAlignment() >= N && (Offset % N) == 0);
7939  // DAG combine handles the stack object case.
7940  return false;
7941}
7942
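/// EltsFromConsecutiveLoads - Check whether the elements demanded by the
/// shuffle N are produced by a run of consecutive, non-extending loads.  On
/// success, LDBase is the load feeding element 0 and LastLoadedElt is the
/// index of the last element whose load was verified.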
7943static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
7944                                     MVT EVT, LoadSDNode *&LDBase,
7945                                     unsigned &LastLoadedElt,
7946                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
7947                                     const TargetLowering &TLI) {
7948  LDBase = NULL;
7949  LastLoadedElt = -1U;
7950  for (unsigned i = 0; i < NumElems; ++i) {
7951    if (N->getMaskElt(i) < 0) {
7952      if (!LDBase)
7953        return false;
7954      continue;
7955    }
7956
7957    SDValue Elt = DAG.getShuffleScalarElt(N, i);
7958    if (!Elt.getNode() ||
7959        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
7960      return false;
7961    if (!LDBase) {
7962      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
7963        return false;
7964      LDBase = cast<LoadSDNode>(Elt.getNode());
7965      LastLoadedElt = i;
7966      continue;
7967    }
7968    if (Elt.getOpcode() == ISD::UNDEF)
7969      continue;
7970
7971    LoadSDNode *LD = cast<LoadSDNode>(Elt);
7972    if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI))
7973      return false;
7974    LastLoadedElt = i;
7975  }
7976  return true;
7977}
7978
7979/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
7980/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
7981/// if the load addresses are consecutive, non-overlapping, and in the right
7982/// order.  In the case of v2i64, it will see if it can rewrite the
7983/// shuffle to be an appropriate build vector so it can take advantage of
7984/// performBuildVectorCombine.
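/// For example, a <0, 1, 2, 3> shuffle of four loads from consecutive
/// addresses becomes a single 16-byte load from the first address.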
7985static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
7986                                     const TargetLowering &TLI) {
7987  DebugLoc dl = N->getDebugLoc();
7988  MVT VT = N->getValueType(0);
7989  MVT EVT = VT.getVectorElementType();
7990  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
7991  unsigned NumElems = VT.getVectorNumElements();
7992
7993  if (VT.getSizeInBits() != 128)
7994    return SDValue();
7995
7996  // Try to combine a vector_shuffle into a 128-bit load.
7997  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7998  LoadSDNode *LD = NULL;
7999  unsigned LastLoadedElt;
8000  if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, LD, LastLoadedElt, DAG,
8001                                MFI, TLI))
8002    return SDValue();
8003
8004  if (LastLoadedElt == NumElems - 1) {
8005    if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI))
8006      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8007                         LD->getSrcValue(), LD->getSrcValueOffset(),
8008                         LD->isVolatile());
8009    return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8010                       LD->getSrcValue(), LD->getSrcValueOffset(),
8011                       LD->isVolatile(), LD->getAlignment());
8012  } else if (NumElems == 4 && LastLoadedElt == 1) {
8013    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
8014    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
8015    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
8016    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
8017  }
8018  return SDValue();
8019}
8020
8021/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
8022static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
8023                                    const X86Subtarget *Subtarget) {
8024  DebugLoc DL = N->getDebugLoc();
8025  SDValue Cond = N->getOperand(0);
8026  // Get the LHS/RHS of the select.
8027  SDValue LHS = N->getOperand(1);
8028  SDValue RHS = N->getOperand(2);
8029
8030  // If we have SSE[12] support, try to form min/max nodes.
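  // For example, (select (setcc X, Y, olt), X, Y) becomes (FMIN X, Y); the
  // same compare with the select operands swapped becomes (FMAX X, Y).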
8031  if (Subtarget->hasSSE2() &&
8032      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
8033      Cond.getOpcode() == ISD::SETCC) {
8034    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
8035
8036    unsigned Opcode = 0;
8037    if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
8038      switch (CC) {
8039      default: break;
8040      case ISD::SETOLE: // (X <= Y) ? X : Y -> min
8041      case ISD::SETULE:
8042      case ISD::SETLE:
8043        if (!UnsafeFPMath) break;
8044        // FALL THROUGH.
8045      case ISD::SETOLT:  // (X olt/lt Y) ? X : Y -> min
8046      case ISD::SETLT:
8047        Opcode = X86ISD::FMIN;
8048        break;
8049
8050      case ISD::SETOGT: // (X > Y) ? X : Y -> max
8051      case ISD::SETUGT:
8052      case ISD::SETGT:
8053        if (!UnsafeFPMath) break;
8054        // FALL THROUGH.
8055      case ISD::SETUGE:  // (X uge/ge Y) ? X : Y -> max
8056      case ISD::SETGE:
8057        Opcode = X86ISD::FMAX;
8058        break;
8059      }
8060    } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
8061      switch (CC) {
8062      default: break;
8063      case ISD::SETOGT: // (X > Y) ? Y : X -> min
8064      case ISD::SETUGT:
8065      case ISD::SETGT:
8066        if (!UnsafeFPMath) break;
8067        // FALL THROUGH.
8068      case ISD::SETUGE:  // (X uge/ge Y) ? Y : X -> min
8069      case ISD::SETGE:
8070        Opcode = X86ISD::FMIN;
8071        break;
8072
8073      case ISD::SETOLE:   // (X <= Y) ? Y : X -> max
8074      case ISD::SETULE:
8075      case ISD::SETLE:
8076        if (!UnsafeFPMath) break;
8077        // FALL THROUGH.
8078      case ISD::SETOLT:   // (X olt/lt Y) ? Y : X -> max
8079      case ISD::SETLT:
8080        Opcode = X86ISD::FMAX;
8081        break;
8082      }
8083    }
8084
8085    if (Opcode)
8086      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
8087  }
8088
8089  // If this is a select between two integer constants, try to do some
8090  // optimizations.
8091  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
8092    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
8093      // Don't do this for crazy integer types.
8094      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
8095        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
8096        // so that TrueC (the true value) is larger than FalseC.
8097        bool NeedsCondInvert = false;
8098
8099        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
8100            // Efficiently invertible.
8101            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
8102             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
8103              isa<ConstantSDNode>(Cond.getOperand(1))))) {
8104          NeedsCondInvert = true;
8105          std::swap(TrueC, FalseC);
8106        }
8107
8108        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
8109        if (FalseC->getAPIntValue() == 0 &&
8110            TrueC->getAPIntValue().isPowerOf2()) {
8111          if (NeedsCondInvert) // Invert the condition if needed.
8112            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8113                               DAG.getConstant(1, Cond.getValueType()));
8114
8115          // Zero extend the condition if needed.
8116          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
8117
8118          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
8119          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
8120                             DAG.getConstant(ShAmt, MVT::i8));
8121        }
8122
8123        // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
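        // For example, (select Cond, 6, 5) becomes (add (zext Cond), 5).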
8124        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
8125          if (NeedsCondInvert) // Invert the condition if needed.
8126            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8127                               DAG.getConstant(1, Cond.getValueType()));
8128
8129          // Zero extend the condition if needed.
8130          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
8131                             FalseC->getValueType(0), Cond);
8132          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8133                             SDValue(FalseC, 0));
8134        }
8135
8136        // Optimize cases that will turn into an LEA instruction.  This requires
8137        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
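        // For example, (select Cond, 13, 4) has Diff = 9, so it becomes
        // 4 + Cond*9, which matches the "lea base(cond, cond*8)" form below.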
8138        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
8139          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
8140          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
8141
8142          bool isFastMultiplier = false;
8143          if (Diff < 10) {
8144            switch ((unsigned char)Diff) {
8145              default: break;
8146              case 1:  // result = add base, cond
8147              case 2:  // result = lea base(    , cond*2)
8148              case 3:  // result = lea base(cond, cond*2)
8149              case 4:  // result = lea base(    , cond*4)
8150              case 5:  // result = lea base(cond, cond*4)
8151              case 8:  // result = lea base(    , cond*8)
8152              case 9:  // result = lea base(cond, cond*8)
8153                isFastMultiplier = true;
8154                break;
8155            }
8156          }
8157
8158          if (isFastMultiplier) {
8159            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
8160            if (NeedsCondInvert) // Invert the condition if needed.
8161              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8162                                 DAG.getConstant(1, Cond.getValueType()));
8163
8164            // Zero extend the condition if needed.
8165            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
8166                               Cond);
8167            // Scale the condition by the difference.
8168            if (Diff != 1)
8169              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
8170                                 DAG.getConstant(Diff, Cond.getValueType()));
8171
8172            // Add the base if non-zero.
8173            if (FalseC->getAPIntValue() != 0)
8174              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8175                                 SDValue(FalseC, 0));
8176            return Cond;
8177          }
8178        }
8179      }
8180  }
8181
8182  return SDValue();
8183}
8184
8185/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
8186static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
8187                                  TargetLowering::DAGCombinerInfo &DCI) {
8188  DebugLoc DL = N->getDebugLoc();
8189
8190  // If the flag operand isn't dead, don't touch this CMOV.
8191  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
8192    return SDValue();
8193
8194  // If this is a select between two integer constants, try to do some
8195  // optimizations.  Note that the operands are ordered the opposite of SELECT
8196  // operands.
8197  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
8198    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
8199      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
8200      // larger than FalseC (the false value).
8201      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
8202
8203      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
8204        CC = X86::GetOppositeBranchCondition(CC);
8205        std::swap(TrueC, FalseC);
8206      }
8207
8208      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
8209      // This is efficient for any integer data type (including i8/i16) and
8210      // shift amount.
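      // For example, (CMOV 0, 32, COND_NE, Flags) becomes
      // (shl (zext (setcc COND_NE, Flags)), 5).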
8211      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
8212        SDValue Cond = N->getOperand(3);
8213        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8214                           DAG.getConstant(CC, MVT::i8), Cond);
8215
8216        // Zero extend the condition if needed.
8217        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
8218
8219        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
8220        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
8221                           DAG.getConstant(ShAmt, MVT::i8));
8222        if (N->getNumValues() == 2)  // Dead flag value?
8223          return DCI.CombineTo(N, Cond, SDValue());
8224        return Cond;
8225      }
8226
8227      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.  This is efficient
8228      // for any integer data type, including i8/i16.
8229      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
8230        SDValue Cond = N->getOperand(3);
8231        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8232                           DAG.getConstant(CC, MVT::i8), Cond);
8233
8234        // Zero extend the condition if needed.
8235        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
8236                           FalseC->getValueType(0), Cond);
8237        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8238                           SDValue(FalseC, 0));
8239
8240        if (N->getNumValues() == 2)  // Dead flag value?
8241          return DCI.CombineTo(N, Cond, SDValue());
8242        return Cond;
8243      }
8244
8245      // Optimize cases that will turn into an LEA instruction.  This requires
8246      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
8247      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
8248        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
8249        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
8250
8251        bool isFastMultiplier = false;
8252        if (Diff < 10) {
8253          switch ((unsigned char)Diff) {
8254          default: break;
8255          case 1:  // result = add base, cond
8256          case 2:  // result = lea base(    , cond*2)
8257          case 3:  // result = lea base(cond, cond*2)
8258          case 4:  // result = lea base(    , cond*4)
8259          case 5:  // result = lea base(cond, cond*4)
8260          case 8:  // result = lea base(    , cond*8)
8261          case 9:  // result = lea base(cond, cond*8)
8262            isFastMultiplier = true;
8263            break;
8264          }
8265        }
8266
8267        if (isFastMultiplier) {
8268          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
8269          SDValue Cond = N->getOperand(3);
8270          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8271                             DAG.getConstant(CC, MVT::i8), Cond);
8272          // Zero extend the condition if needed.
8273          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
8274                             Cond);
8275          // Scale the condition by the difference.
8276          if (Diff != 1)
8277            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
8278                               DAG.getConstant(Diff, Cond.getValueType()));
8279
8280          // Add the base if non-zero.
8281          if (FalseC->getAPIntValue() != 0)
8282            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8283                               SDValue(FalseC, 0));
8284          if (N->getNumValues() == 2)  // Dead flag value?
8285            return DCI.CombineTo(N, Cond, SDValue());
8286          return Cond;
8287        }
8288      }
8289    }
8290  }
8291  return SDValue();
8292}
8293
8294
8295/// PerformMulCombine - Optimize a single multiply by a constant into two
8296/// multiplies in order to implement it with two cheaper instructions, e.g.
8297/// LEA + SHL, LEA + LEA.
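/// For example, x*45 becomes (x*9)*5 (two LEAs), and x*40 typically becomes
/// (x<<3)*5 (a shift followed by an LEA).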
8298static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
8299                                 TargetLowering::DAGCombinerInfo &DCI) {
8300  if (DAG.getMachineFunction().
8301      getFunction()->hasFnAttr(Attribute::OptimizeForSize))
8302    return SDValue();
8303
8304  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
8305    return SDValue();
8306
8307  MVT VT = N->getValueType(0);
8308  if (VT != MVT::i64)
8309    return SDValue();
8310
8311  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8312  if (!C)
8313    return SDValue();
8314  uint64_t MulAmt = C->getZExtValue();
8315  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
8316    return SDValue();
8317
8318  uint64_t MulAmt1 = 0;
8319  uint64_t MulAmt2 = 0;
8320  if ((MulAmt % 9) == 0) {
8321    MulAmt1 = 9;
8322    MulAmt2 = MulAmt / 9;
8323  } else if ((MulAmt % 5) == 0) {
8324    MulAmt1 = 5;
8325    MulAmt2 = MulAmt / 5;
8326  } else if ((MulAmt % 3) == 0) {
8327    MulAmt1 = 3;
8328    MulAmt2 = MulAmt / 3;
8329  }
8330  if (MulAmt2 &&
8331      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
8332    DebugLoc DL = N->getDebugLoc();
8333
8334    if (isPowerOf2_64(MulAmt2) &&
8335        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
8336      // If the second multiplier is pow2, issue it first. We want the multiply by
8337      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
8338      // is an add.
8339      std::swap(MulAmt1, MulAmt2);
8340
8341    SDValue NewMul;
8342    if (isPowerOf2_64(MulAmt1))
8343      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
8344                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
8345    else
8346      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
8347                           DAG.getConstant(MulAmt1, VT));
8348
8349    if (isPowerOf2_64(MulAmt2))
8350      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
8351                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
8352    else
8353      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
8354                           DAG.getConstant(MulAmt2, VT));
8355
8356    // Do not add new nodes to DAG combiner worklist.
8357    DCI.CombineTo(N, NewMul, false);
8358  }
8359  return SDValue();
8360}
8361
8362
8363/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
8364///                       when possible.
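/// For example, (shl v4i32 X, <5, 5, 5, 5>) becomes the x86_sse2_pslli_d
/// intrinsic applied to X with the single scalar shift amount 5.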
8365static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
8366                                   const X86Subtarget *Subtarget) {
8367  // On X86 with SSE2 support, we can transform this to a vector shift if
8368  // all elements are shifted by the same amount.  We can't do this in legalize
8369  // because a constant vector is typically transformed to a constant pool
8370  // load, so we have no knowledge of the shift amount.
8371  if (!Subtarget->hasSSE2())
8372    return SDValue();
8373
8374  MVT VT = N->getValueType(0);
8375  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
8376    return SDValue();
8377
8378  SDValue ShAmtOp = N->getOperand(1);
8379  MVT EltVT = VT.getVectorElementType();
8380  DebugLoc DL = N->getDebugLoc();
8381  SDValue BaseShAmt;
8382  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
8383    unsigned NumElts = VT.getVectorNumElements();
8384    unsigned i = 0;
8385    for (; i != NumElts; ++i) {
8386      SDValue Arg = ShAmtOp.getOperand(i);
8387      if (Arg.getOpcode() == ISD::UNDEF) continue;
8388      BaseShAmt = Arg;
8389      break;
8390    }
8391    for (; i != NumElts; ++i) {
8392      SDValue Arg = ShAmtOp.getOperand(i);
8393      if (Arg.getOpcode() == ISD::UNDEF) continue;
8394      if (Arg != BaseShAmt) {
8395        return SDValue();
8396      }
8397    }
8398  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
8399             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
8400    BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
8401                            DAG.getIntPtrConstant(0));
8402  } else
8403    return SDValue();
8404
8405  if (EltVT.bitsGT(MVT::i32))
8406    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
8407  else if (EltVT.bitsLT(MVT::i32))
8408    BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BaseShAmt);
8409
8410  // The shift amount is identical so we can do a vector shift.
8411  SDValue ValOp = N->getOperand(0);
8412  switch (N->getOpcode()) {
8413  default:
8414    llvm_unreachable("Unknown shift opcode!");
8415    break;
8416  case ISD::SHL:
8417    if (VT == MVT::v2i64)
8418      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8419                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
8420                         ValOp, BaseShAmt);
8421    if (VT == MVT::v4i32)
8422      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8423                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
8424                         ValOp, BaseShAmt);
8425    if (VT == MVT::v8i16)
8426      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8427                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
8428                         ValOp, BaseShAmt);
8429    break;
8430  case ISD::SRA:
8431    if (VT == MVT::v4i32)
8432      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8433                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
8434                         ValOp, BaseShAmt);
8435    if (VT == MVT::v8i16)
8436      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8437                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
8438                         ValOp, BaseShAmt);
8439    break;
8440  case ISD::SRL:
8441    if (VT == MVT::v2i64)
8442      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8443                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
8444                         ValOp, BaseShAmt);
8445    if (VT == MVT::v4i32)
8446      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8447                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
8448                         ValOp, BaseShAmt);
8449    if (VT == MVT::v8i16)
8450      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8451                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
8452                         ValOp, BaseShAmt);
8453    break;
8454  }
8455  return SDValue();
8456}
8457
8458/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
8459static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
8460                                   const X86Subtarget *Subtarget) {
8461  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
8462  // the FP state in cases where an emms may be missing.
8463  // A preferable solution to the general problem is to figure out the right
8464  // places to insert EMMS.  This qualifies as a quick hack.
8465
8466  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
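  // For example, in 32-bit mode an i64 (store (load p), q) becomes a single
  // f64 load/store pair when SSE2 is available, while a 64-bit MMX value is
  // otherwise split into two i32 load/store pairs.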
8467  StoreSDNode *St = cast<StoreSDNode>(N);
8468  MVT VT = St->getValue().getValueType();
8469  if (VT.getSizeInBits() != 64)
8470    return SDValue();
8471
8472  const Function *F = DAG.getMachineFunction().getFunction();
8473  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
8474  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
8475    && Subtarget->hasSSE2();
8476  if ((VT.isVector() ||
8477       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
8478      isa<LoadSDNode>(St->getValue()) &&
8479      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
8480      St->getChain().hasOneUse() && !St->isVolatile()) {
8481    SDNode* LdVal = St->getValue().getNode();
8482    LoadSDNode *Ld = 0;
8483    int TokenFactorIndex = -1;
8484    SmallVector<SDValue, 8> Ops;
8485    SDNode* ChainVal = St->getChain().getNode();
8486    // Must be a store of a load.  We currently handle two cases:  the load
8487    // is a direct child, or it's under an intervening TokenFactor.  It is
8488    // possible to dig deeper under nested TokenFactors.
8489    if (ChainVal == LdVal)
8490      Ld = cast<LoadSDNode>(St->getChain());
8491    else if (St->getValue().hasOneUse() &&
8492             ChainVal->getOpcode() == ISD::TokenFactor) {
8493      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
8494        if (ChainVal->getOperand(i).getNode() == LdVal) {
8495          TokenFactorIndex = i;
8496          Ld = cast<LoadSDNode>(St->getValue());
8497        } else
8498          Ops.push_back(ChainVal->getOperand(i));
8499      }
8500    }
8501
8502    if (!Ld || !ISD::isNormalLoad(Ld))
8503      return SDValue();
8504
8505    // If this is not the MMX case, i.e. we are just turning i64 load/store
8506    // into f64 load/store, avoid the transformation if there are multiple
8507    // uses of the loaded value.
8508    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
8509      return SDValue();
8510
8511    DebugLoc LdDL = Ld->getDebugLoc();
8512    DebugLoc StDL = N->getDebugLoc();
8513    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
8514    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
8515    // pair instead.
8516    if (Subtarget->is64Bit() || F64IsLegal) {
8517      MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
8518      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
8519                                  Ld->getBasePtr(), Ld->getSrcValue(),
8520                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
8521                                  Ld->getAlignment());
8522      SDValue NewChain = NewLd.getValue(1);
8523      if (TokenFactorIndex != -1) {
8524        Ops.push_back(NewChain);
8525        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
8526                               Ops.size());
8527      }
8528      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
8529                          St->getSrcValue(), St->getSrcValueOffset(),
8530                          St->isVolatile(), St->getAlignment());
8531    }
8532
8533    // Otherwise, lower to two pairs of 32-bit loads / stores.
8534    SDValue LoAddr = Ld->getBasePtr();
8535    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
8536                                 DAG.getConstant(4, MVT::i32));
8537
8538    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
8539                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
8540                               Ld->isVolatile(), Ld->getAlignment());
8541    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
8542                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
8543                               Ld->isVolatile(),
8544                               MinAlign(Ld->getAlignment(), 4));
8545
8546    SDValue NewChain = LoLd.getValue(1);
8547    if (TokenFactorIndex != -1) {
8548      Ops.push_back(LoLd);
8549      Ops.push_back(HiLd);
8550      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
8551                             Ops.size());
8552    }
8553
8554    LoAddr = St->getBasePtr();
8555    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
8556                         DAG.getConstant(4, MVT::i32));
8557
8558    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
8559                                St->getSrcValue(), St->getSrcValueOffset(),
8560                                St->isVolatile(), St->getAlignment());
8561    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
8562                                St->getSrcValue(),
8563                                St->getSrcValueOffset() + 4,
8564                                St->isVolatile(),
8565                                MinAlign(St->getAlignment(), 4));
8566    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
8567  }
8568  return SDValue();
8569}
8570
8571/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
8572/// X86ISD::FXOR nodes.
8573static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
8574  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
8575  // F[X]OR(0.0, x) -> x
8576  // F[X]OR(x, 0.0) -> x
8577  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
8578    if (C->getValueAPF().isPosZero())
8579      return N->getOperand(1);
8580  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
8581    if (C->getValueAPF().isPosZero())
8582      return N->getOperand(0);
8583  return SDValue();
8584}
8585
8586/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
8587static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
8588  // FAND(0.0, x) -> 0.0
8589  // FAND(x, 0.0) -> 0.0
8590  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
8591    if (C->getValueAPF().isPosZero())
8592      return N->getOperand(0);
8593  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
8594    if (C->getValueAPF().isPosZero())
8595      return N->getOperand(1);
8596  return SDValue();
8597}
8598
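/// PerformBTCombine - Do target-specific dag combines on X86ISD::BT nodes.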
8599static SDValue PerformBTCombine(SDNode *N,
8600                                SelectionDAG &DAG,
8601                                TargetLowering::DAGCombinerInfo &DCI) {
8602  // BT ignores high bits in the bit index operand.
8603  SDValue Op1 = N->getOperand(1);
8604  if (Op1.hasOneUse()) {
8605    unsigned BitWidth = Op1.getValueSizeInBits();
8606    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
8607    APInt KnownZero, KnownOne;
8608    TargetLowering::TargetLoweringOpt TLO(DAG);
8609    TargetLowering &TLI = DAG.getTargetLoweringInfo();
8610    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
8611        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
8612      DCI.CommitTargetLoweringOpt(TLO);
8613  }
8614  return SDValue();
8615}
8616
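/// PerformVZEXT_MOVLCombine - Fold a VZEXT_MOVL of a (possibly bitcast)
/// VZEXT_LOAD into a plain bit_convert when the vector element sizes match,
/// since the load already zero-extends the value.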
8617static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
8618  SDValue Op = N->getOperand(0);
8619  if (Op.getOpcode() == ISD::BIT_CONVERT)
8620    Op = Op.getOperand(0);
8621  MVT VT = N->getValueType(0), OpVT = Op.getValueType();
8622  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
8623      VT.getVectorElementType().getSizeInBits() ==
8624      OpVT.getVectorElementType().getSizeInBits()) {
8625    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
8626  }
8627  return SDValue();
8628}
8629
8630// On X86 and X86-64, atomic operations are lowered to locked instructions.
8631// Locked instructions, in turn, have implicit fence semantics (all memory
8632// operations are flushed before issuing the locked instruction, and they
8633// are not buffered), so we can fold away the common pattern of
8634// fence-atomic-fence.
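// In other words, for (membarrier (atomic-op ... (membarrier chain))) the
// atomic's input chain is rewired past the leading fence, and the trailing
// fence is then replaced by the atomic's own output chain.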
8635static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
8636  SDValue atomic = N->getOperand(0);
8637  switch (atomic.getOpcode()) {
8638    case ISD::ATOMIC_CMP_SWAP:
8639    case ISD::ATOMIC_SWAP:
8640    case ISD::ATOMIC_LOAD_ADD:
8641    case ISD::ATOMIC_LOAD_SUB:
8642    case ISD::ATOMIC_LOAD_AND:
8643    case ISD::ATOMIC_LOAD_OR:
8644    case ISD::ATOMIC_LOAD_XOR:
8645    case ISD::ATOMIC_LOAD_NAND:
8646    case ISD::ATOMIC_LOAD_MIN:
8647    case ISD::ATOMIC_LOAD_MAX:
8648    case ISD::ATOMIC_LOAD_UMIN:
8649    case ISD::ATOMIC_LOAD_UMAX:
8650      break;
8651    default:
8652      return SDValue();
8653  }
8654
8655  SDValue fence = atomic.getOperand(0);
8656  if (fence.getOpcode() != ISD::MEMBARRIER)
8657    return SDValue();
8658
8659  switch (atomic.getOpcode()) {
8660    case ISD::ATOMIC_CMP_SWAP:
8661      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
8662                                    atomic.getOperand(1), atomic.getOperand(2),
8663                                    atomic.getOperand(3));
8664    case ISD::ATOMIC_SWAP:
8665    case ISD::ATOMIC_LOAD_ADD:
8666    case ISD::ATOMIC_LOAD_SUB:
8667    case ISD::ATOMIC_LOAD_AND:
8668    case ISD::ATOMIC_LOAD_OR:
8669    case ISD::ATOMIC_LOAD_XOR:
8670    case ISD::ATOMIC_LOAD_NAND:
8671    case ISD::ATOMIC_LOAD_MIN:
8672    case ISD::ATOMIC_LOAD_MAX:
8673    case ISD::ATOMIC_LOAD_UMIN:
8674    case ISD::ATOMIC_LOAD_UMAX:
8675      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
8676                                    atomic.getOperand(1), atomic.getOperand(2));
8677    default:
8678      return SDValue();
8679  }
8680}
8681
8682SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
8683                                             DAGCombinerInfo &DCI) const {
8684  SelectionDAG &DAG = DCI.DAG;
8685  switch (N->getOpcode()) {
8686  default: break;
8687  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
8688  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
8689  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
8690  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
8691  case ISD::SHL:
8692  case ISD::SRA:
8693  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
8694  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
8695  case X86ISD::FXOR:
8696  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
8697  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
8698  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
8699  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
8700  case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
8701  }
8702
8703  return SDValue();
8704}
8705
8706//===----------------------------------------------------------------------===//
8707//                           X86 Inline Assembly Support
8708//===----------------------------------------------------------------------===//
8709
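/// LowerToBSwap - Rewrite an inline asm call that implements a byte swap as a
/// call to the llvm.bswap intrinsic.  Returns true if the rewrite was done.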
8710static bool LowerToBSwap(CallInst *CI) {
8711  // FIXME: this should verify that we are targeting a 486 or better.  If not,
8712  // we will turn this bswap into something that will be lowered to logical ops
8713  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
8714  // so don't worry about this.
8715
8716  // Verify this is a simple bswap.
8717  if (CI->getNumOperands() != 2 ||
8718      CI->getType() != CI->getOperand(1)->getType() ||
8719      !CI->getType()->isInteger())
8720    return false;
8721
8722  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
8723  if (!Ty || Ty->getBitWidth() % 16 != 0)
8724    return false;
8725
8726  // Okay, we can do this xform, do so now.
8727  const Type *Tys[] = { Ty };
8728  Module *M = CI->getParent()->getParent()->getParent();
8729  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
8730
8731  Value *Op = CI->getOperand(1);
8732  Op = CallInst::Create(Int, Op, CI->getName(), CI);
8733
8734  CI->replaceAllUsesWith(Op);
8735  CI->eraseFromParent();
8736  return true;
8737}
8738
8739bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
8740  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
8741  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
8742
8743  std::string AsmStr = IA->getAsmString();
8744
8745  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
8746  std::vector<std::string> AsmPieces;
8747  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?
8748
8749  switch (AsmPieces.size()) {
8750  default: return false;
8751  case 1:
8752    AsmStr = AsmPieces[0];
8753    AsmPieces.clear();
8754    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
8755
8756    // bswap $0
8757    if (AsmPieces.size() == 2 &&
8758        (AsmPieces[0] == "bswap" ||
8759         AsmPieces[0] == "bswapq" ||
8760         AsmPieces[0] == "bswapl") &&
8761        (AsmPieces[1] == "$0" ||
8762         AsmPieces[1] == "${0:q}")) {
8763      // No need to check constraints; nothing other than the equivalent of
8764      // "=r,0" would be valid here.
8765      return LowerToBSwap(CI);
8766    }
8767    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
8768    if (CI->getType() == Type::Int16Ty &&
8769        AsmPieces.size() == 3 &&
8770        AsmPieces[0] == "rorw" &&
8771        AsmPieces[1] == "$$8," &&
8772        AsmPieces[2] == "${0:w}" &&
8773        IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
8774      return LowerToBSwap(CI);
8775    }
8776    break;
8777  case 3:
8778    if (CI->getType() == Type::Int64Ty && Constraints.size() >= 2 &&
8779        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
8780        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
8781      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
8782      std::vector<std::string> Words;
8783      SplitString(AsmPieces[0], Words, " \t");
8784      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
8785        Words.clear();
8786        SplitString(AsmPieces[1], Words, " \t");
8787        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
8788          Words.clear();
8789          SplitString(AsmPieces[2], Words, " \t,");
8790          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
8791              Words[2] == "%edx") {
8792            return LowerToBSwap(CI);
8793          }
8794        }
8795      }
8796    }
8797    break;
8798  }
8799  return false;
8800}
8801
8802
8803
8804/// getConstraintType - Given a constraint letter, return the type of
8805/// constraint it is for this target.
8806X86TargetLowering::ConstraintType
8807X86TargetLowering::getConstraintType(const std::string &Constraint) const {
8808  if (Constraint.size() == 1) {
8809    switch (Constraint[0]) {
8810    case 'A':
8811      return C_Register;
8812    case 'f':
8813    case 'r':
8814    case 'R':
8815    case 'l':
8816    case 'q':
8817    case 'Q':
8818    case 'x':
8819    case 'y':
8820    case 'Y':
8821      return C_RegisterClass;
8822    case 'e':
8823    case 'Z':
8824      return C_Other;
8825    default:
8826      break;
8827    }
8828  }
8829  return TargetLowering::getConstraintType(Constraint);
8830}
8831
8832/// LowerXConstraint - try to replace an X constraint, which matches anything,
8833/// with another that has more specific requirements based on the type of the
8834/// corresponding operand.
8835const char *X86TargetLowering::
8836LowerXConstraint(MVT ConstraintVT) const {
8837  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
8838  // 'f' like normal targets.
8839  if (ConstraintVT.isFloatingPoint()) {
8840    if (Subtarget->hasSSE2())
8841      return "Y";
8842    if (Subtarget->hasSSE1())
8843      return "x";
8844  }
8845
8846  return TargetLowering::LowerXConstraint(ConstraintVT);
8847}
8848
8849/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
8850/// vector.  If it is invalid, don't add anything to Ops.
8851void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
8852                                                     char Constraint,
8853                                                     bool hasMemory,
8854                                                     std::vector<SDValue>&Ops,
8855                                                     SelectionDAG &DAG) const {
8856  SDValue Result(0, 0);
8857
8858  switch (Constraint) {
8859  default: break;
8860  case 'I':
8861    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8862      if (C->getZExtValue() <= 31) {
8863        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8864        break;
8865      }
8866    }
8867    return;
8868  case 'J':
8869    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8870      if (C->getZExtValue() <= 63) {
8871        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8872        break;
8873      }
8874    }
8875    return;
8876  case 'K':
8877    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8878      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
8879        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8880        break;
8881      }
8882    }
8883    return;
8884  case 'N':
8885    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8886      if (C->getZExtValue() <= 255) {
8887        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8888        break;
8889      }
8890    }
8891    return;
8892  case 'e': {
8893    // 32-bit signed value
8894    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8895      const ConstantInt *CI = C->getConstantIntValue();
8896      if (CI->isValueValidForType(Type::Int32Ty, C->getSExtValue())) {
8897        // Widen to 64 bits here to get it sign extended.
8898        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
8899        break;
8900      }
8901    // FIXME gcc accepts some relocatable values here too, but only in certain
8902    // memory models; it's complicated.
8903    }
8904    return;
8905  }
8906  case 'Z': {
8907    // 32-bit unsigned value
8908    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8909      const ConstantInt *CI = C->getConstantIntValue();
8910      if (CI->isValueValidForType(Type::Int32Ty, C->getZExtValue())) {
8911        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8912        break;
8913      }
8914    }
8915    // FIXME gcc accepts some relocatable values here too, but only in certain
8916    // memory models; it's complicated.
8917    return;
8918  }
8919  case 'i': {
8920    // Literal immediates are always ok.
8921    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
8922      // Widen to 64 bits here to get it sign extended.
8923      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
8924      break;
8925    }
8926
8927    // If we are in non-pic codegen mode, we allow the address of a global (with
8928    // an optional displacement) to be used with 'i'.
8929    GlobalAddressSDNode *GA = 0;
8930    int64_t Offset = 0;
8931
8932    // Match either (GA), (GA+C), (GA+C1+C2), etc.
8933    while (1) {
8934      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
8935        Offset += GA->getOffset();
8936        break;
8937      } else if (Op.getOpcode() == ISD::ADD) {
8938        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
8939          Offset += C->getZExtValue();
8940          Op = Op.getOperand(0);
8941          continue;
8942        }
8943      } else if (Op.getOpcode() == ISD::SUB) {
8944        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
8945          Offset += -C->getZExtValue();
8946          Op = Op.getOperand(0);
8947          continue;
8948        }
8949      }
8950
8951      // Otherwise, this isn't something we can handle, reject it.
8952      return;
8953    }
8954
8955    GlobalValue *GV = GA->getGlobal();
8956    // If we require an extra load to get this address, as in PIC mode, we
8957    // can't accept it.
8958    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
8959                                                        getTargetMachine())))
8960      return;
8961
8962    if (hasMemory)
8963      Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
8964    else
8965      Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
8966    Result = Op;
8967    break;
8968  }
8969  }
8970
8971  if (Result.getNode()) {
8972    Ops.push_back(Result);
8973    return;
8974  }
8975  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
8976                                                      Ops, DAG);
8977}
8978
8979std::vector<unsigned> X86TargetLowering::
8980getRegClassForInlineAsmConstraint(const std::string &Constraint,
8981                                  MVT VT) const {
8982  if (Constraint.size() == 1) {
8983    // FIXME: not handling fp-stack yet!
8984    switch (Constraint[0]) {      // GCC X86 Constraint Letters
8985    default: break;  // Unknown constraint letter
8986    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
8987      if (Subtarget->is64Bit()) {
8988        if (VT == MVT::i32)
8989          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
8990                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
8991                                       X86::R10D,X86::R11D,X86::R12D,
8992                                       X86::R13D,X86::R14D,X86::R15D,
8993                                       X86::EBP, X86::ESP, 0);
8994        else if (VT == MVT::i16)
8995          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
8996                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
8997                                       X86::R10W,X86::R11W,X86::R12W,
8998                                       X86::R13W,X86::R14W,X86::R15W,
8999                                       X86::BP,  X86::SP, 0);
9000        else if (VT == MVT::i8)
9001          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
9002                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
9003                                       X86::R10B,X86::R11B,X86::R12B,
9004                                       X86::R13B,X86::R14B,X86::R15B,
9005                                       X86::BPL, X86::SPL, 0);
9006
9007        else if (VT == MVT::i64)
9008          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
9009                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
9010                                       X86::R10, X86::R11, X86::R12,
9011                                       X86::R13, X86::R14, X86::R15,
9012                                       X86::RBP, X86::RSP, 0);
9013
9014        break;
9015      }
9016      // 32-bit fallthrough
9017    case 'Q':   // Q_REGS
9018      if (VT == MVT::i32)
9019        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
9020      else if (VT == MVT::i16)
9021        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
9022      else if (VT == MVT::i8)
9023        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
9024      else if (VT == MVT::i64)
9025        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
9026      break;
9027    }
9028  }
9029
9030  return std::vector<unsigned>();
9031}
9032
9033std::pair<unsigned, const TargetRegisterClass*>
9034X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
9035                                                MVT VT) const {
9036  // First, see if this is a constraint that directly corresponds to an LLVM
9037  // register class.
9038  if (Constraint.size() == 1) {
9039    // GCC Constraint Letters
9040    switch (Constraint[0]) {
9041    default: break;
9042    case 'r':   // GENERAL_REGS
9043    case 'R':   // LEGACY_REGS
9044    case 'l':   // INDEX_REGS
9045      if (VT == MVT::i8)
9046        return std::make_pair(0U, X86::GR8RegisterClass);
9047      if (VT == MVT::i16)
9048        return std::make_pair(0U, X86::GR16RegisterClass);
9049      if (VT == MVT::i32 || !Subtarget->is64Bit())
9050        return std::make_pair(0U, X86::GR32RegisterClass);
9051      return std::make_pair(0U, X86::GR64RegisterClass);
9052    case 'f':  // FP Stack registers.
9053      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
9054      // value to the correct fpstack register class.
9055      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
9056        return std::make_pair(0U, X86::RFP32RegisterClass);
9057      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
9058        return std::make_pair(0U, X86::RFP64RegisterClass);
9059      return std::make_pair(0U, X86::RFP80RegisterClass);
9060    case 'y':   // MMX_REGS if MMX allowed.
9061      if (!Subtarget->hasMMX()) break;
9062      return std::make_pair(0U, X86::VR64RegisterClass);
9063    case 'Y':   // SSE_REGS if SSE2 allowed
9064      if (!Subtarget->hasSSE2()) break;
9065      // FALL THROUGH.
9066    case 'x':   // SSE_REGS if SSE1 allowed
9067      if (!Subtarget->hasSSE1()) break;
9068
9069      switch (VT.getSimpleVT()) {
9070      default: break;
9071      // Scalar SSE types.
9072      case MVT::f32:
9073      case MVT::i32:
9074        return std::make_pair(0U, X86::FR32RegisterClass);
9075      case MVT::f64:
9076      case MVT::i64:
9077        return std::make_pair(0U, X86::FR64RegisterClass);
9078      // Vector types.
9079      case MVT::v16i8:
9080      case MVT::v8i16:
9081      case MVT::v4i32:
9082      case MVT::v2i64:
9083      case MVT::v4f32:
9084      case MVT::v2f64:
9085        return std::make_pair(0U, X86::VR128RegisterClass);
9086      }
9087      break;
9088    }
9089  }
9090
9091  // Use the default implementation in TargetLowering to convert the register
9092  // constraint into a member of a register class.
9093  std::pair<unsigned, const TargetRegisterClass*> Res;
9094  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
9095
9096  // Not found as a standard register?
9097  if (Res.second == 0) {
9098    // GCC calls "st(0)" just plain "st".
9099    if (StringsEqualNoCase("{st}", Constraint)) {
9100      Res.first = X86::ST0;
9101      Res.second = X86::RFP80RegisterClass;
9102    }
9103    // 'A' means EAX + EDX.
9104    if (Constraint == "A") {
9105      Res.first = X86::EAX;
9106      Res.second = X86::GR32_ADRegisterClass;
9107    }
9108    return Res;
9109  }
9110
9111  // Otherwise, check to see if this is a register class of the wrong value
9112  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it to
9113  // turn into {ax},{dx}.
9114  if (Res.second->hasType(VT))
9115    return Res;   // Correct type already, nothing to do.
9116
9117  // All of the single-register GCC register classes map their values onto
9118  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
9119  // really want an 8-bit, 32-bit, or 64-bit register, map to the appropriate
9120  // register class and return the appropriate register.
9121  if (Res.second == X86::GR16RegisterClass) {
9122    if (VT == MVT::i8) {
9123      unsigned DestReg = 0;
9124      switch (Res.first) {
9125      default: break;
9126      case X86::AX: DestReg = X86::AL; break;
9127      case X86::DX: DestReg = X86::DL; break;
9128      case X86::CX: DestReg = X86::CL; break;
9129      case X86::BX: DestReg = X86::BL; break;
9130      }
9131      if (DestReg) {
9132        Res.first = DestReg;
9133        Res.second = X86::GR8RegisterClass;
9134      }
9135    } else if (VT == MVT::i32) {
9136      unsigned DestReg = 0;
9137      switch (Res.first) {
9138      default: break;
9139      case X86::AX: DestReg = X86::EAX; break;
9140      case X86::DX: DestReg = X86::EDX; break;
9141      case X86::CX: DestReg = X86::ECX; break;
9142      case X86::BX: DestReg = X86::EBX; break;
9143      case X86::SI: DestReg = X86::ESI; break;
9144      case X86::DI: DestReg = X86::EDI; break;
9145      case X86::BP: DestReg = X86::EBP; break;
9146      case X86::SP: DestReg = X86::ESP; break;
9147      }
9148      if (DestReg) {
9149        Res.first = DestReg;
9150        Res.second = X86::GR32RegisterClass;
9151      }
9152    } else if (VT == MVT::i64) {
9153      unsigned DestReg = 0;
9154      switch (Res.first) {
9155      default: break;
9156      case X86::AX: DestReg = X86::RAX; break;
9157      case X86::DX: DestReg = X86::RDX; break;
9158      case X86::CX: DestReg = X86::RCX; break;
9159      case X86::BX: DestReg = X86::RBX; break;
9160      case X86::SI: DestReg = X86::RSI; break;
9161      case X86::DI: DestReg = X86::RDI; break;
9162      case X86::BP: DestReg = X86::RBP; break;
9163      case X86::SP: DestReg = X86::RSP; break;
9164      }
9165      if (DestReg) {
9166        Res.first = DestReg;
9167        Res.second = X86::GR64RegisterClass;
9168      }
9169    }
9170  } else if (Res.second == X86::FR32RegisterClass ||
9171             Res.second == X86::FR64RegisterClass ||
9172             Res.second == X86::VR128RegisterClass) {
9173    // Handle references to XMM physical registers that got mapped into the
9174    // wrong class.  This can happen with constraints like {xmm0} where the
9175    // target independent register mapper will just pick the first match it can
9176    // find, ignoring the required type.
9177    if (VT == MVT::f32)
9178      Res.second = X86::FR32RegisterClass;
9179    else if (VT == MVT::f64)
9180      Res.second = X86::FR64RegisterClass;
9181    else if (X86::VR128RegisterClass->hasType(VT))
9182      Res.second = X86::VR128RegisterClass;
9183  }
9184
9185  return Res;
9186}
9187
9188//===----------------------------------------------------------------------===//
9189//                           X86 Widen vector type
9190//===----------------------------------------------------------------------===//
9191
9192/// getWidenVectorType: given a vector type, returns the type to widen
9193/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
9194/// If there is no vector type that we want to widen to, returns MVT::Other.
9195/// When and where to widen is target dependent based on the cost of
9196/// scalarizing vs using the wider vector type.
9197
9198MVT X86TargetLowering::getWidenVectorType(MVT VT) const {
9199  assert(VT.isVector());
9200  if (isTypeLegal(VT))
9201    return VT;
9202
9203  // TODO: In computeRegisterProperty, we can compute the list of legal vector
9204  //       type based on element type.  This would speed up our search (though
9205  //       it may not be worth it since the size of the list is relatively
9206  //       small).
9207  MVT EltVT = VT.getVectorElementType();
9208  unsigned NElts = VT.getVectorNumElements();
9209
9210  // On X86, it makes sense to widen any vector wider than 1 element.
9211  if (NElts <= 1)
9212    return MVT::Other;
9213
9214  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
9215       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
9216    MVT SVT = (MVT::SimpleValueType)nVT;
9217
9218    if (isTypeLegal(SVT) &&
9219        SVT.getVectorElementType() == EltVT &&
9220        SVT.getVectorNumElements() > NElts)
9221      return SVT;
9222  }
9223  return MVT::Other;
9224}
9225