X86ISelLowering.cpp revision 3cc4a307a8f687dcf33ef947c792b6ede406db0d
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#include "X86.h"
16#include "X86InstrBuilder.h"
17#include "X86ISelLowering.h"
18#include "X86TargetMachine.h"
19#include "X86TargetObjectFile.h"
20#include "llvm/CallingConv.h"
21#include "llvm/Constants.h"
22#include "llvm/DerivedTypes.h"
23#include "llvm/GlobalAlias.h"
24#include "llvm/GlobalVariable.h"
25#include "llvm/Function.h"
26#include "llvm/Instructions.h"
27#include "llvm/Intrinsics.h"
28#include "llvm/LLVMContext.h"
29#include "llvm/ADT/BitVector.h"
30#include "llvm/ADT/VectorExtras.h"
31#include "llvm/CodeGen/MachineFrameInfo.h"
32#include "llvm/CodeGen/MachineFunction.h"
33#include "llvm/CodeGen/MachineInstrBuilder.h"
34#include "llvm/CodeGen/MachineModuleInfo.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/PseudoSourceValue.h"
37#include "llvm/Support/MathExtras.h"
38#include "llvm/Support/Debug.h"
39#include "llvm/Support/ErrorHandling.h"
40#include "llvm/Target/TargetOptions.h"
41#include "llvm/ADT/SmallSet.h"
42#include "llvm/ADT/StringExtras.h"
43#include "llvm/Support/CommandLine.h"
44#include "llvm/Support/raw_ostream.h"
45using namespace llvm;
46
47static cl::opt<bool>
48DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
49
50// Disable16Bit - 16-bit operations typically have a larger encoding than
51// corresponding 32-bit instructions, and 16-bit code is slow on some
52// processors. This is an experimental flag to disable 16-bit operations
53// (which forces them to be Legalized to 32-bit operations).
54static cl::opt<bool>
55Disable16Bit("disable-16bit", cl::Hidden,
56             cl::desc("Disable use of 16-bit instructions"));
57
58// Forward declarations.
59static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
60                       SDValue V2);
61
62static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
63  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
64  default: llvm_unreachable("unknown subtarget type");
65  case X86Subtarget::isDarwin:
66    if (TM.getSubtarget<X86Subtarget>().is64Bit())
67      return new X8664_MachoTargetObjectFile();
68    return new X8632_MachoTargetObjectFile();
69  case X86Subtarget::isELF:
70    return new TargetLoweringObjectFileELF();
71  case X86Subtarget::isMingw:
72  case X86Subtarget::isCygwin:
73  case X86Subtarget::isWindows:
74    return new TargetLoweringObjectFileCOFF();
75  }
76
77}
78
79X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
80  : TargetLowering(TM, createTLOF(TM)) {
81  Subtarget = &TM.getSubtarget<X86Subtarget>();
82  X86ScalarSSEf64 = Subtarget->hasSSE2();
83  X86ScalarSSEf32 = Subtarget->hasSSE1();
84  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
85
86  RegInfo = TM.getRegisterInfo();
87  TD = getTargetData();
88
89  // Set up the TargetLowering object.
90
91  // X86 is weird; it always uses i8 for shift amounts and setcc results.
92  setShiftAmountType(MVT::i8);
93  setBooleanContents(ZeroOrOneBooleanContent);
94  setSchedulingPreference(SchedulingForRegPressure);
95  setStackPointerRegisterToSaveRestore(X86StackPtr);
96
97  if (Subtarget->isTargetDarwin()) {
98    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
99    setUseUnderscoreSetJmp(false);
100    setUseUnderscoreLongJmp(false);
101  } else if (Subtarget->isTargetMingw()) {
102    // The MS runtime is weird: it exports _setjmp, but only plain longjmp!
103    setUseUnderscoreSetJmp(true);
104    setUseUnderscoreLongJmp(false);
105  } else {
106    setUseUnderscoreSetJmp(true);
107    setUseUnderscoreLongJmp(true);
108  }
109
110  // Set up the register classes.
111  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
112  if (!Disable16Bit)
113    addRegisterClass(MVT::i16, X86::GR16RegisterClass);
114  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
115  if (Subtarget->is64Bit())
116    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
117
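  // There is no 1-bit load instruction, so an i1 sign-extending load is
  // promoted: the value is loaded as an i8 and then sign-extended.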
118  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
119
120  // We don't accept any truncstore of integer registers.
121  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
122  if (!Disable16Bit)
123    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
124  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
125  if (!Disable16Bit)
126    setTruncStoreAction(MVT::i32, MVT::i16, Expand);
127  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
128  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
129
130  // SETOEQ and SETUNE require checking two conditions.
131  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
132  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
133  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
134  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
135  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
136  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
137
138  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
139  // operation.
140  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
141  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
142  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
143
144  if (Subtarget->is64Bit()) {
145    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
146    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
147  } else if (!UseSoftFloat) {
148    if (X86ScalarSSEf64) {
149      // We have an impenetrably clever algorithm for ui64->double only.
150      setOperationAction(ISD::UINT_TO_FP   , MVT::i64  , Custom);
151    }
152    // With SSE2 we have a custom algorithm for this; other targets turn it
153    // into a 64-bit FILD.
154    setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
155  }
156
157  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
158  // this operation.
159  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
160  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
161
162  if (!UseSoftFloat) {
163    // SSE has no i16 to fp conversion, only i32
164    if (X86ScalarSSEf32) {
165      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
166      // f32 and f64 cases are Legal, f80 case is not
167      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
168    } else {
169      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
170      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
171    }
172  } else {
173    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
174    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
175  }
176
177  // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
178  // are Legal, f80 is custom lowered.
179  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
180  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
181
182  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
183  // this operation.
184  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
185  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
186
187  if (X86ScalarSSEf32) {
188    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
189    // f32 and f64 cases are Legal, f80 case is not
190    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
191  } else {
192    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
193    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
194  }
195
196  // Handle FP_TO_UINT by promoting the destination to a larger signed
197  // conversion.
198  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
199  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
200  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
201
202  if (Subtarget->is64Bit()) {
203    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
204    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
205  } else if (!UseSoftFloat) {
206    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
207      // Expand FP_TO_UINT into a select.
208      // FIXME: We would like to use a Custom expander here eventually to do
209      // the optimal thing for SSE vs. the default expansion in the legalizer.
210      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
211    else
212      // With SSE3 we can use fisttpll to convert to a signed i64; without
213      // SSE, we're stuck with a fistpll.
214      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
215  }
216
217  // TODO: when we have SSE, these could be more efficient by using movd/movq.
218  if (!X86ScalarSSEf64) {
219    setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
220    setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
221  }
222
223  // Scalar integer divide and remainder are lowered to use operations that
224  // produce two results, to match the available instructions. This exposes
225  // the two-result form to trivial CSE, which is able to combine x/y and x%y
226  // into a single instruction.
227  //
228  // Scalar integer multiply-high is also lowered to use two-result
229  // operations, to match the available instructions. However, plain multiply
230  // (low) operations are left as Legal, as there are single-result
231  // instructions for this in x86. Using the two-result multiply instructions
232  // when both high and low results are needed must be arranged by dagcombine.
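  // For example, the IR pair
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // becomes two ISD::SDIVREM nodes that CSE into one, which then selects to a
  // single idiv producing both the quotient and the remainder.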
233  setOperationAction(ISD::MULHS           , MVT::i8    , Expand);
234  setOperationAction(ISD::MULHU           , MVT::i8    , Expand);
235  setOperationAction(ISD::SDIV            , MVT::i8    , Expand);
236  setOperationAction(ISD::UDIV            , MVT::i8    , Expand);
237  setOperationAction(ISD::SREM            , MVT::i8    , Expand);
238  setOperationAction(ISD::UREM            , MVT::i8    , Expand);
239  setOperationAction(ISD::MULHS           , MVT::i16   , Expand);
240  setOperationAction(ISD::MULHU           , MVT::i16   , Expand);
241  setOperationAction(ISD::SDIV            , MVT::i16   , Expand);
242  setOperationAction(ISD::UDIV            , MVT::i16   , Expand);
243  setOperationAction(ISD::SREM            , MVT::i16   , Expand);
244  setOperationAction(ISD::UREM            , MVT::i16   , Expand);
245  setOperationAction(ISD::MULHS           , MVT::i32   , Expand);
246  setOperationAction(ISD::MULHU           , MVT::i32   , Expand);
247  setOperationAction(ISD::SDIV            , MVT::i32   , Expand);
248  setOperationAction(ISD::UDIV            , MVT::i32   , Expand);
249  setOperationAction(ISD::SREM            , MVT::i32   , Expand);
250  setOperationAction(ISD::UREM            , MVT::i32   , Expand);
251  setOperationAction(ISD::MULHS           , MVT::i64   , Expand);
252  setOperationAction(ISD::MULHU           , MVT::i64   , Expand);
253  setOperationAction(ISD::SDIV            , MVT::i64   , Expand);
254  setOperationAction(ISD::UDIV            , MVT::i64   , Expand);
255  setOperationAction(ISD::SREM            , MVT::i64   , Expand);
256  setOperationAction(ISD::UREM            , MVT::i64   , Expand);
257
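  // BRCOND is custom lowered so branches can use X86 condition codes directly;
  // BR_CC and SELECT_CC are expanded back into a separate SETCC feeding a
  // branch or select, and BR_JT is expanded into an indirect branch through
  // the jump table.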
258  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
259  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
260  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
261  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
262  if (Subtarget->is64Bit())
263    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
264  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
265  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
266  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
267  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
268  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
269  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
270  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
271  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
272
273  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
274  setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
275  setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
276  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
277  if (Disable16Bit) {
278    setOperationAction(ISD::CTTZ           , MVT::i16  , Expand);
279    setOperationAction(ISD::CTLZ           , MVT::i16  , Expand);
280  } else {
281    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
282    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
283  }
284  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
285  setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
286  setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
287  if (Subtarget->is64Bit()) {
288    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
289    setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
290    setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
291  }
292
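  // READCYCLECOUNTER is custom lowered to RDTSC. BSWAP is only defined for
  // 32-bit and 64-bit registers on x86, so the i16 form is expanded.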
293  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
294  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
295
296  // These should be promoted to a larger select which is supported.
297  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
298  // X86 wants to expand cmov itself.
299  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
300  if (Disable16Bit)
301    setOperationAction(ISD::SELECT        , MVT::i16  , Expand);
302  else
303    setOperationAction(ISD::SELECT        , MVT::i16  , Custom);
304  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
305  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
306  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
307  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
308  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
309  if (Disable16Bit)
310    setOperationAction(ISD::SETCC         , MVT::i16  , Expand);
311  else
312    setOperationAction(ISD::SETCC         , MVT::i16  , Custom);
313  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
314  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
315  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
316  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
317  if (Subtarget->is64Bit()) {
318    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
319    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
320  }
321  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
322
323  // Darwin ABI issue.
324  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
325  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
326  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
327  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
328  if (Subtarget->is64Bit())
329    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
330  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
331  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
332  if (Subtarget->is64Bit()) {
333    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
334    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
335    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
336    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
337    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
338  }
339  // 64-bit add, sub, shl, sra, srl (32-bit x86 only)
340  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
341  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
342  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
343  if (Subtarget->is64Bit()) {
344    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
345    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
346    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
347  }
348
349  if (Subtarget->hasSSE1())
350    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
351
352  if (!Subtarget->hasSSE2())
353    setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);
354
355  // Expand certain atomics
356  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
357  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
358  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
359  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
360
361  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
362  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
363  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
364  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
365
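  // On 32-bit targets the only native 64-bit atomic instruction is cmpxchg8b,
  // so the remaining 64-bit atomic operations are custom lowered to
  // cmpxchg8b-based loops.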
366  if (!Subtarget->is64Bit()) {
367    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
368    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
369    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
370    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
371    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
372    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
373    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
374  }
375
376  // FIXME - use subtarget debug flags
377  if (!Subtarget->isTargetDarwin() &&
378      !Subtarget->isTargetELF() &&
379      !Subtarget->isTargetCygMing()) {
380    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
381  }
382
383  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
384  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
385  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
386  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
387  if (Subtarget->is64Bit()) {
388    setExceptionPointerRegister(X86::RAX);
389    setExceptionSelectorRegister(X86::RDX);
390  } else {
391    setExceptionPointerRegister(X86::EAX);
392    setExceptionSelectorRegister(X86::EDX);
393  }
394  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
395  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
396
397  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
398
399  setOperationAction(ISD::TRAP, MVT::Other, Legal);
400
401  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
402  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
403  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
404  if (Subtarget->is64Bit()) {
405    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
406    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
407  } else {
408    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
409    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
410  }
411
412  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
413  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
414  if (Subtarget->is64Bit())
415    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
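  // On Cygwin/MinGW the stack must be probed when it grows dynamically, so
  // dynamic stack allocation is custom lowered there; other targets use the
  // default expansion.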
416  if (Subtarget->isTargetCygMing())
417    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
418  else
419    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
420
421  if (!UseSoftFloat && X86ScalarSSEf64) {
422    // f32 and f64 use SSE.
423    // Set up the FP register classes.
424    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
425    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
426
427    // Use ANDPD to simulate FABS.
428    setOperationAction(ISD::FABS , MVT::f64, Custom);
429    setOperationAction(ISD::FABS , MVT::f32, Custom);
430
431    // Use XORP to simulate FNEG.
432    setOperationAction(ISD::FNEG , MVT::f64, Custom);
433    setOperationAction(ISD::FNEG , MVT::f32, Custom);
434
435    // Use ANDPD and ORPD to simulate FCOPYSIGN.
436    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
437    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
438
439    // We don't support sin/cos/fmod
440    setOperationAction(ISD::FSIN , MVT::f64, Expand);
441    setOperationAction(ISD::FCOS , MVT::f64, Expand);
442    setOperationAction(ISD::FSIN , MVT::f32, Expand);
443    setOperationAction(ISD::FCOS , MVT::f32, Expand);
444
445    // Expand FP immediates into loads from the stack, except for the special
446    // cases we handle.
447    addLegalFPImmediate(APFloat(+0.0)); // xorpd
448    addLegalFPImmediate(APFloat(+0.0f)); // xorps
449  } else if (!UseSoftFloat && X86ScalarSSEf32) {
450    // Use SSE for f32, x87 for f64.
451    // Set up the FP register classes.
452    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
453    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
454
455    // Use ANDPS to simulate FABS.
456    setOperationAction(ISD::FABS , MVT::f32, Custom);
457
458    // Use XORP to simulate FNEG.
459    setOperationAction(ISD::FNEG , MVT::f32, Custom);
460
461    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
462
463    // Use ANDPS and ORPS to simulate FCOPYSIGN.
464    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
465    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
466
467    // We don't support sin/cos/fmod
468    setOperationAction(ISD::FSIN , MVT::f32, Expand);
469    setOperationAction(ISD::FCOS , MVT::f32, Expand);
470
471    // Special cases we handle for FP constants.
472    addLegalFPImmediate(APFloat(+0.0f)); // xorps
473    addLegalFPImmediate(APFloat(+0.0)); // FLD0
474    addLegalFPImmediate(APFloat(+1.0)); // FLD1
475    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
476    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
477
478    if (!UnsafeFPMath) {
479      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
480      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
481    }
482  } else if (!UseSoftFloat) {
483    // f32 and f64 in x87.
484    // Set up the FP register classes.
485    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
486    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
487
488    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
489    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
490    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
491    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
492
493    if (!UnsafeFPMath) {
494      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
495      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
496    }
497    addLegalFPImmediate(APFloat(+0.0)); // FLD0
498    addLegalFPImmediate(APFloat(+1.0)); // FLD1
499    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
500    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
501    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
502    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
503    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
504    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
505  }
506
507  // Long double always uses X87.
508  if (!UseSoftFloat) {
509    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
510    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
511    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
512    {
513      bool ignored;
514      APFloat TmpFlt(+0.0);
515      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
516                     &ignored);
517      addLegalFPImmediate(TmpFlt);  // FLD0
518      TmpFlt.changeSign();
519      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
520      APFloat TmpFlt2(+1.0);
521      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
522                      &ignored);
523      addLegalFPImmediate(TmpFlt2);  // FLD1
524      TmpFlt2.changeSign();
525      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
526    }
527
528    if (!UnsafeFPMath) {
529      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
530      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
531    }
532  }
533
534  // Always use a library call for pow.
535  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
536  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
537  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
538
539  setOperationAction(ISD::FLOG, MVT::f80, Expand);
540  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
541  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
542  setOperationAction(ISD::FEXP, MVT::f80, Expand);
543  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
544
545  // First set operation action for all vector types to either promote
546  // (for widening) or expand (for scalarization). Then we will selectively
547  // turn on ones that can be effectively codegen'd.
548  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
549       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
550    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
551    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
552    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
553    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
554    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
555    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
556    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
557    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
558    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
559    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
560    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
561    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
562    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
563    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
564    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
565    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
566    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
567    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
568    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
569    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
570    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
571    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
572    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
573    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
574    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
575    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
576    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
577    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
578    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
579    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
580    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
581    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
582    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
583    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
584    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
585    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
586    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
587    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
588    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
589    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
590    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
591    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
592    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
593    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
594    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
595    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
596    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
597    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
598    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
599    setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
600    setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
601    setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
602    setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
603    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
604         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
605      setTruncStoreAction((MVT::SimpleValueType)VT,
606                          (MVT::SimpleValueType)InnerVT, Expand);
607    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
608    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
609    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
610  }
611
612  // FIXME: To prevent SSE instructions from being expanded to MMX ones
613  // with -msoft-float, disable use of MMX as well.
614  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
615    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
616    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
617    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
618    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
619    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
620
621    setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
622    setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
623    setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
624    setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
625
626    setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
627    setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
628    setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
629    setOperationAction(ISD::SUB,                MVT::v1i64, Legal);
630
631    setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
632    setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
633
634    setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
635    AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
636    setOperationAction(ISD::AND,                MVT::v4i16, Promote);
637    AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
638    setOperationAction(ISD::AND,                MVT::v2i32, Promote);
639    AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
640    setOperationAction(ISD::AND,                MVT::v1i64, Legal);
641
642    setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
643    AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
644    setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
645    AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
646    setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
647    AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
648    setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
649
650    setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
651    AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
652    setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
653    AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
654    setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
655    AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
656    setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
657
658    setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
659    AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
660    setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
661    AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
662    setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
663    AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
664    setOperationAction(ISD::LOAD,               MVT::v2f32, Promote);
665    AddPromotedToType (ISD::LOAD,               MVT::v2f32, MVT::v1i64);
666    setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
667
668    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
669    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
670    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
671    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f32, Custom);
672    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
673
674    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
675    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
676    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
677    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
678
679    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2f32, Custom);
680    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
681    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
682    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
683
684    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
685
686    setOperationAction(ISD::SELECT,             MVT::v8i8, Promote);
687    setOperationAction(ISD::SELECT,             MVT::v4i16, Promote);
688    setOperationAction(ISD::SELECT,             MVT::v2i32, Promote);
689    setOperationAction(ISD::SELECT,             MVT::v1i64, Custom);
690    setOperationAction(ISD::VSETCC,             MVT::v8i8, Custom);
691    setOperationAction(ISD::VSETCC,             MVT::v4i16, Custom);
692    setOperationAction(ISD::VSETCC,             MVT::v2i32, Custom);
693  }
694
695  if (!UseSoftFloat && Subtarget->hasSSE1()) {
696    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
697
698    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
699    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
700    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
701    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
702    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
703    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
704    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
705    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
706    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
707    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
708    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
709    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
710  }
711
712  if (!UseSoftFloat && Subtarget->hasSSE2()) {
713    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
714
715    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
716    // registers cannot be used even for integer operations.
717    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
718    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
719    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
720    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
721
722    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
723    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
724    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
725    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
726    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
727    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
728    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
729    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
730    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
731    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
732    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
733    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
734    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
735    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
736    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
737    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
738
739    setOperationAction(ISD::VSETCC,             MVT::v2f64, Custom);
740    setOperationAction(ISD::VSETCC,             MVT::v16i8, Custom);
741    setOperationAction(ISD::VSETCC,             MVT::v8i16, Custom);
742    setOperationAction(ISD::VSETCC,             MVT::v4i32, Custom);
743
744    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
745    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
746    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
747    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
748    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
749
750    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
751    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
752      EVT VT = (MVT::SimpleValueType)i;
753      // Do not attempt to custom lower non-power-of-2 vectors
754      if (!isPowerOf2_32(VT.getVectorNumElements()))
755        continue;
756      // Do not attempt to custom lower non-128-bit vectors
757      if (!VT.is128BitVector())
758        continue;
759      setOperationAction(ISD::BUILD_VECTOR,
760                         VT.getSimpleVT().SimpleTy, Custom);
761      setOperationAction(ISD::VECTOR_SHUFFLE,
762                         VT.getSimpleVT().SimpleTy, Custom);
763      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
764                         VT.getSimpleVT().SimpleTy, Custom);
765    }
766
767    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
768    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
769    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
770    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
771    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
772    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
773
774    if (Subtarget->is64Bit()) {
775      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
776      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
777    }
778
779    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
780    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
781      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
782      EVT VT = SVT;
783
784      // Do not attempt to promote non-128-bit vectors
785      if (!VT.is128BitVector()) {
786        continue;
787      }
788      setOperationAction(ISD::AND,    SVT, Promote);
789      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
790      setOperationAction(ISD::OR,     SVT, Promote);
791      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
792      setOperationAction(ISD::XOR,    SVT, Promote);
793      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
794      setOperationAction(ISD::LOAD,   SVT, Promote);
795      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
796      setOperationAction(ISD::SELECT, SVT, Promote);
797      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
798    }
799
800    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
801
802    // Custom lower v2i64 and v2f64 selects.
803    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
804    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
805    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
806    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
807
808    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
809    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
810    if (!DisableMMX && Subtarget->hasMMX()) {
811      setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
812      setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
813    }
814  }
815
816  if (Subtarget->hasSSE41()) {
817    // FIXME: Do we need to handle scalar-to-vector here?
818    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
819
820    // i8 and i16 vectors are custom, because the source register and
821    // memory operand types are not the same width.  f32 vectors are
822    // custom since the immediate controlling the insert encodes additional
823    // information.
824    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
825    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
826    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
827    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
828
829    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
830    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
831    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
832    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
833
834    if (Subtarget->is64Bit()) {
835      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
836      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
837    }
838  }
839
840  if (Subtarget->hasSSE42()) {
841    setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
842  }
843
844  if (!UseSoftFloat && Subtarget->hasAVX()) {
845    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
846    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
847    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
848    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
849
850    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
851    setOperationAction(ISD::LOAD,               MVT::v8i32, Legal);
852    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
853    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
854    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
855    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
856    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
857    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
858    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
859    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
860    //setOperationAction(ISD::BUILD_VECTOR,       MVT::v8f32, Custom);
861    //setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8f32, Custom);
862    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
863    //setOperationAction(ISD::SELECT,             MVT::v8f32, Custom);
864    //setOperationAction(ISD::VSETCC,             MVT::v8f32, Custom);
865
866    // The v16i16 and v32i8 operations below are commented out for future consideration.
867    //setOperationAction(ISD::ADD,                MVT::v16i16, Legal);
868    setOperationAction(ISD::ADD,                MVT::v8i32, Custom);
869    setOperationAction(ISD::ADD,                MVT::v4i64, Custom);
870    //setOperationAction(ISD::SUB,                MVT::v32i8, Legal);
871    //setOperationAction(ISD::SUB,                MVT::v16i16, Legal);
872    setOperationAction(ISD::SUB,                MVT::v8i32, Custom);
873    setOperationAction(ISD::SUB,                MVT::v4i64, Custom);
874    //setOperationAction(ISD::MUL,                MVT::v16i16, Legal);
875    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
876    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
877    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
878    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
879    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
880    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
881
882    setOperationAction(ISD::VSETCC,             MVT::v4f64, Custom);
883    // setOperationAction(ISD::VSETCC,             MVT::v32i8, Custom);
884    // setOperationAction(ISD::VSETCC,             MVT::v16i16, Custom);
885    setOperationAction(ISD::VSETCC,             MVT::v8i32, Custom);
886
887    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i8, Custom);
888    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i16, Custom);
889    // setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i16, Custom);
890    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i32, Custom);
891    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8f32, Custom);
892
893    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f64, Custom);
894    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i64, Custom);
895    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f64, Custom);
896    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i64, Custom);
897    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f64, Custom);
898    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);
899
900#if 0
901    // Not sure we want to do this since there are no 256-bit integer
902    // operations in AVX
903
904    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
905    // This includes 256-bit vectors
906    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
907      EVT VT = (MVT::SimpleValueType)i;
908
909      // Do not attempt to custom lower non-power-of-2 vectors
910      if (!isPowerOf2_32(VT.getVectorNumElements()))
911        continue;
912
913      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
914      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
915      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
916    }
917
918    if (Subtarget->is64Bit()) {
919      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i64, Custom);
920      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
921    }
922#endif
923
924#if 0
925    // Not sure we want to do this since there are no 256-bit integer
926    // operations in AVX
927
928    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
929    // Including 256-bit vectors
930    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
931      EVT VT = (MVT::SimpleValueType)i;
932
933      if (!VT.is256BitVector()) {
934        continue;
935      }
936      setOperationAction(ISD::AND,    VT, Promote);
937      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
938      setOperationAction(ISD::OR,     VT, Promote);
939      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
940      setOperationAction(ISD::XOR,    VT, Promote);
941      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
942      setOperationAction(ISD::LOAD,   VT, Promote);
943      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
944      setOperationAction(ISD::SELECT, VT, Promote);
945      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
946    }
947
948    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
949#endif
950  }
951
952  // We want to custom lower some of our intrinsics.
953  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
954
955  // Add/Sub/Mul with overflow operations are custom lowered.
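  // The custom lowering maps them onto X86 arithmetic nodes that also produce
  // EFLAGS, so the overflow/carry result is read straight from the flags with
  // a setcc.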
956  setOperationAction(ISD::SADDO, MVT::i32, Custom);
957  setOperationAction(ISD::SADDO, MVT::i64, Custom);
958  setOperationAction(ISD::UADDO, MVT::i32, Custom);
959  setOperationAction(ISD::UADDO, MVT::i64, Custom);
960  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
961  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
962  setOperationAction(ISD::USUBO, MVT::i32, Custom);
963  setOperationAction(ISD::USUBO, MVT::i64, Custom);
964  setOperationAction(ISD::SMULO, MVT::i32, Custom);
965  setOperationAction(ISD::SMULO, MVT::i64, Custom);
966
967  if (!Subtarget->is64Bit()) {
968    // These libcalls are not available in 32-bit.
969    setLibcallName(RTLIB::SHL_I128, 0);
970    setLibcallName(RTLIB::SRL_I128, 0);
971    setLibcallName(RTLIB::SRA_I128, 0);
972  }
973
974  // We have target-specific dag combine patterns for the following nodes:
975  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
976  setTargetDAGCombine(ISD::BUILD_VECTOR);
977  setTargetDAGCombine(ISD::SELECT);
978  setTargetDAGCombine(ISD::SHL);
979  setTargetDAGCombine(ISD::SRA);
980  setTargetDAGCombine(ISD::SRL);
981  setTargetDAGCombine(ISD::STORE);
982  setTargetDAGCombine(ISD::MEMBARRIER);
983  setTargetDAGCombine(ISD::ZERO_EXTEND);
984  if (Subtarget->is64Bit())
985    setTargetDAGCombine(ISD::MUL);
986
987  computeRegisterProperties();
988
989  // Divide and remainder operations have no vector equivalent and can
990  // trap. Do a custom widening for these operations in which we never
991  // generate more divide/remainder operations than the original vector width.
992  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
993       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
994    if (!isTypeLegal((MVT::SimpleValueType)VT)) {
995      setOperationAction(ISD::SDIV, (MVT::SimpleValueType) VT, Custom);
996      setOperationAction(ISD::UDIV, (MVT::SimpleValueType) VT, Custom);
997      setOperationAction(ISD::SREM, (MVT::SimpleValueType) VT, Custom);
998      setOperationAction(ISD::UREM, (MVT::SimpleValueType) VT, Custom);
999    }
1000  }
1001
1002  // FIXME: These should be based on subtarget info. Plus, the values should
1003  // be smaller when we are optimizing for size.
1004  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1005  maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
1006  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
1007  setPrefLoopAlignment(16);
1008  benefitFromCodePlacementOpt = true;
1009}
1010
1011
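/// getSetCCResultType - Return the type produced by SETCC. X86 setcc
/// instructions write an 8-bit register, so i8 is used regardless of the
/// operand type.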
1012MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
1013  return MVT::i8;
1014}
1015
1016
1017/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1018/// the desired ByVal argument alignment.
1019static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
1020  if (MaxAlign == 16)
1021    return;
1022  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1023    if (VTy->getBitWidth() == 128)
1024      MaxAlign = 16;
1025  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1026    unsigned EltAlign = 0;
1027    getMaxByValAlign(ATy->getElementType(), EltAlign);
1028    if (EltAlign > MaxAlign)
1029      MaxAlign = EltAlign;
1030  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
1031    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1032      unsigned EltAlign = 0;
1033      getMaxByValAlign(STy->getElementType(i), EltAlign);
1034      if (EltAlign > MaxAlign)
1035        MaxAlign = EltAlign;
1036      if (MaxAlign == 16)
1037        break;
1038    }
1039  }
1040  return;
1041}
1042
1043/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1044/// function arguments in the caller parameter area. For X86, aggregates
1045/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1046/// are at 4-byte boundaries.
1047unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
1048  if (Subtarget->is64Bit()) {
1049    // Max of 8 and alignment of type.
1050    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1051    if (TyAlign > 8)
1052      return TyAlign;
1053    return 8;
1054  }
1055
1056  unsigned Align = 4;
1057  if (Subtarget->hasSSE1())
1058    getMaxByValAlign(Ty, Align);
1059  return Align;
1060}
1061
1062/// getOptimalMemOpType - Returns the target specific optimal type for load
1063/// and store operations as a result of memset, memcpy, and memmove
1064/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
1065/// determining it.
1066EVT
1067X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
1068                                       bool isSrcConst, bool isSrcStr,
1069                                       SelectionDAG &DAG) const {
1070  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
1071  // linux.  This is because the stack realignment code can't handle certain
1072  // cases like PR2962.  This should be removed when PR2962 is fixed.
1073  const Function *F = DAG.getMachineFunction().getFunction();
1074  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
1075  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
1076    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
1077      return MVT::v4i32;
1078    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
1079      return MVT::v4f32;
1080  }
1081  if (Subtarget->is64Bit() && Size >= 8)
1082    return MVT::i64;
1083  return MVT::i32;
1084}
1085
1086/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
1087/// jumptable.
1088SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1089                                                      SelectionDAG &DAG) const {
1090  if (usesGlobalOffsetTable())
1091    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
1092  if (!Subtarget->is64Bit())
1093    // This doesn't have a DebugLoc associated with it, but is not really the
1094    // same as a Register.
1095    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
1096                       getPointerTy());
1097  return Table;
1098}
1099
1100/// getFunctionAlignment - Return the Log2 alignment of this function.
1101unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
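  // The result is a log2 value: 16-byte alignment (2^4) by default, and no
  // extra alignment (2^0) when the function is marked for size optimization.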
1102  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
1103}
1104
1105//===----------------------------------------------------------------------===//
1106//               Return Value Calling Convention Implementation
1107//===----------------------------------------------------------------------===//
1108
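// X86GenCallingConv.inc is generated by TableGen from X86CallingConv.td and
// provides RetCC_X86 and the CC_X86_* calling-convention analysis functions
// used below.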
1109#include "X86GenCallingConv.inc"
1110
1111bool
1112X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
1113                        const SmallVectorImpl<EVT> &OutTys,
1114                        const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
1115                        SelectionDAG &DAG) {
1116  SmallVector<CCValAssign, 16> RVLocs;
1117  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1118                 RVLocs, *DAG.getContext());
1119  return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
1120}
1121
1122SDValue
1123X86TargetLowering::LowerReturn(SDValue Chain,
1124                               CallingConv::ID CallConv, bool isVarArg,
1125                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1126                               DebugLoc dl, SelectionDAG &DAG) {
1127
1128  SmallVector<CCValAssign, 16> RVLocs;
1129  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1130                 RVLocs, *DAG.getContext());
1131  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1132
1133  // If this is the first return lowered for this function, add the regs to the
1134  // liveout set for the function.
1135  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
1136    for (unsigned i = 0; i != RVLocs.size(); ++i)
1137      if (RVLocs[i].isRegLoc())
1138        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1139  }
1140
1141  SDValue Flag;
1142
1143  SmallVector<SDValue, 6> RetOps;
1144  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1145  // Operand #1 = Bytes To Pop
1146  RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16));
1147
1148  // Copy the result values into the output registers.
1149  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1150    CCValAssign &VA = RVLocs[i];
1151    assert(VA.isRegLoc() && "Can only return in registers!");
1152    SDValue ValToCopy = Outs[i].Val;
1153
1154    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1155    // the RET instruction and handled by the FP Stackifier.
1156    if (VA.getLocReg() == X86::ST0 ||
1157        VA.getLocReg() == X86::ST1) {
1158      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1159      // change the value to the FP stack register class.
1160      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1161        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1162      RetOps.push_back(ValToCopy);
1163      // Don't emit a copytoreg.
1164      continue;
1165    }
1166
1167    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1168    // which is returned in RAX / RDX.
1169    if (Subtarget->is64Bit()) {
1170      EVT ValVT = ValToCopy.getValueType();
1171      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
1172        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
1173        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
1174          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
1175      }
1176    }
1177
1178    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1179    Flag = Chain.getValue(1);
1180  }
1181
1182  // The x86-64 ABI for returning structs by value requires that we copy
1183  // the sret argument into %rax for the return. We saved the argument into
1184  // a virtual register in the entry block, so now we copy the value out
1185  // and into %rax.
1186  if (Subtarget->is64Bit() &&
1187      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1188    MachineFunction &MF = DAG.getMachineFunction();
1189    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1190    unsigned Reg = FuncInfo->getSRetReturnReg();
1191    if (!Reg) {
1192      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1193      FuncInfo->setSRetReturnReg(Reg);
1194    }
1195    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1196
1197    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1198    Flag = Chain.getValue(1);
1199
1200    // RAX now acts like a return value.
1201    MF.getRegInfo().addLiveOut(X86::RAX);
1202  }
1203
1204  RetOps[0] = Chain;  // Update chain.
1205
1206  // Add the flag if we have it.
1207  if (Flag.getNode())
1208    RetOps.push_back(Flag);
1209
1210  return DAG.getNode(X86ISD::RET_FLAG, dl,
1211                     MVT::Other, &RetOps[0], RetOps.size());
1212}
1213
1214/// LowerCallResult - Lower the result values of a call into the
1215/// appropriate copies out of appropriate physical registers.
1216///
1217SDValue
1218X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1219                                   CallingConv::ID CallConv, bool isVarArg,
1220                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1221                                   DebugLoc dl, SelectionDAG &DAG,
1222                                   SmallVectorImpl<SDValue> &InVals) {
1223
1224  // Assign locations to each value returned by this call.
1225  SmallVector<CCValAssign, 16> RVLocs;
1226  bool Is64Bit = Subtarget->is64Bit();
1227  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1228                 RVLocs, *DAG.getContext());
1229  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1230
1231  // Copy all of the result registers out of their specified physreg.
1232  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1233    CCValAssign &VA = RVLocs[i];
1234    EVT CopyVT = VA.getValVT();
1235
1236    // If this is x86-64, and we disabled SSE, we can't return FP values
1237    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1238        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1239      llvm_report_error("SSE register return with SSE disabled");
1240    }
1241
1242    // If this is a call to a function that returns an fp value on the floating
1243    // point stack, but where we prefer to use the value in xmm registers, copy
1244    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
1245    if ((VA.getLocReg() == X86::ST0 ||
1246         VA.getLocReg() == X86::ST1) &&
1247        isScalarFPTypeInSSEReg(VA.getValVT())) {
1248      CopyVT = MVT::f80;
1249    }
1250
1251    SDValue Val;
1252    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
1253      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
1254      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1255        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1256                                   MVT::v2i64, InFlag).getValue(1);
1257        Val = Chain.getValue(0);
1258        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1259                          Val, DAG.getConstant(0, MVT::i64));
1260      } else {
1261        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1262                                   MVT::i64, InFlag).getValue(1);
1263        Val = Chain.getValue(0);
1264      }
1265      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
1266    } else {
1267      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1268                                 CopyVT, InFlag).getValue(1);
1269      Val = Chain.getValue(0);
1270    }
1271    InFlag = Chain.getValue(2);
1272
1273    if (CopyVT != VA.getValVT()) {
1274      // Round the F80 to the right size, which also moves it to the
1275      // appropriate xmm register.
1276      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1277                        // This truncation won't change the value.
1278                        DAG.getIntPtrConstant(1));
1279    }
1280
1281    InVals.push_back(Val);
1282  }
1283
1284  return Chain;
1285}
1286
1287
1288//===----------------------------------------------------------------------===//
1289//                C & StdCall & Fast Calling Convention implementation
1290//===----------------------------------------------------------------------===//
1291//  The StdCall calling convention is standard for many Windows API routines.
1292//  It differs from the C calling convention only slightly: the callee cleans
1293//  up the stack instead of the caller, and symbols are decorated in a
1294//  particular way :) It doesn't support any vector arguments.
1295//  For info on fast calling convention see Fast Calling Convention (tail call)
1296//  implementation LowerX86_32FastCCCallTo.
1297
1298/// CallIsStructReturn - Determines whether a call uses struct return
1299/// semantics.
1300static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1301  if (Outs.empty())
1302    return false;
1303
1304  return Outs[0].Flags.isSRet();
1305}
1306
1307/// ArgsAreStructReturn - Determines whether a function uses struct
1308/// return semantics.
1309static bool
1310ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1311  if (Ins.empty())
1312    return false;
1313
1314  return Ins[0].Flags.isSRet();
1315}
1316
1317/// IsCalleePop - Determines whether the callee is required to pop its
1318/// own arguments. Callee pop is necessary to support tail calls.
1319bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
1320  if (IsVarArg)
1321    return false;
1322
1323  switch (CallingConv) {
1324  default:
1325    return false;
1326  case CallingConv::X86_StdCall:
1327    return !Subtarget->is64Bit();
1328  case CallingConv::X86_FastCall:
1329    return !Subtarget->is64Bit();
1330  case CallingConv::Fast:
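    // fastcc only pops its own arguments when tail call optimization is
    // enabled; callee pop is what makes tail calls possible.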
1331    return PerformTailCallOpt;
1332  }
1333}
1334
1335/// CCAssignFnForNode - Selects the correct CCAssignFn for the
1336/// given CallingConvention value.
1337CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1338  if (Subtarget->is64Bit()) {
1339    if (Subtarget->isTargetWin64())
1340      return CC_X86_Win64_C;
1341    else
1342      return CC_X86_64_C;
1343  }
1344
1345  if (CC == CallingConv::X86_FastCall)
1346    return CC_X86_32_FastCall;
1347  else if (CC == CallingConv::Fast)
1348    return CC_X86_32_FastCC;
1349  else
1350    return CC_X86_32_C;
1351}
1352
1353/// NameDecorationForCallConv - Selects the appropriate decoration to
1354/// apply to a MachineFunction containing a given calling convention.
1355NameDecorationStyle
1356X86TargetLowering::NameDecorationForCallConv(CallingConv::ID CallConv) {
1357  if (CallConv == CallingConv::X86_FastCall)
1358    return FastCall;
1359  else if (CallConv == CallingConv::X86_StdCall)
1360    return StdCall;
1361  return None;
1362}
1363
1364
1365/// CreateCopyOfByValArgument - Make a copy of an aggregate at the address specified
1366/// by "Src" to address "Dst" with size and alignment information specified by
1367/// the specific parameter attribute. The copy will be passed as a byval
1368/// function parameter.
1369static SDValue
1370CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1371                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1372                          DebugLoc dl) {
1373  SDValue SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1374  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1375                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
1376}
1377
1378SDValue
1379X86TargetLowering::LowerMemArgument(SDValue Chain,
1380                                    CallingConv::ID CallConv,
1381                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1382                                    DebugLoc dl, SelectionDAG &DAG,
1383                                    const CCValAssign &VA,
1384                                    MachineFrameInfo *MFI,
1385                                    unsigned i) {
1386
1387  // Create the nodes corresponding to a load from this parameter slot.
1388  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1389  bool AlwaysUseMutable = (CallConv==CallingConv::Fast) && PerformTailCallOpt;
1390  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1391  EVT ValVT;
1392
1393  // If the value is passed by pointer, the address is passed instead of the
1394  // value itself.
1395  if (VA.getLocInfo() == CCValAssign::Indirect)
1396    ValVT = VA.getLocVT();
1397  else
1398    ValVT = VA.getValVT();
1399
1400  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1401  // changed with more analysis.
1402  // In case of tail call optimization, mark all arguments mutable, since they
1403  // could be overwritten by the lowering of arguments during a tail call.
1404  int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1405                                  VA.getLocMemOffset(), isImmutable, false);
1406  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
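  // byval arguments already live in the argument area, so just hand back the
  // address of the fixed stack object; anything else is loaded from its slot.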
1407  if (Flags.isByVal())
1408    return FIN;
1409  return DAG.getLoad(ValVT, dl, Chain, FIN,
1410                     PseudoSourceValue::getFixedStack(FI), 0);
1411}
1412
1413SDValue
1414X86TargetLowering::LowerFormalArguments(SDValue Chain,
1415                                        CallingConv::ID CallConv,
1416                                        bool isVarArg,
1417                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1418                                        DebugLoc dl,
1419                                        SelectionDAG &DAG,
1420                                        SmallVectorImpl<SDValue> &InVals) {
1421
1422  MachineFunction &MF = DAG.getMachineFunction();
1423  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1424
1425  const Function* Fn = MF.getFunction();
1426  if (Fn->hasExternalLinkage() &&
1427      Subtarget->isTargetCygMing() &&
1428      Fn->getName() == "main")
1429    FuncInfo->setForceFramePointer(true);
1430
1431  // Decorate the function name.
1432  FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv));
1433
1434  MachineFrameInfo *MFI = MF.getFrameInfo();
1435  bool Is64Bit = Subtarget->is64Bit();
1436  bool IsWin64 = Subtarget->isTargetWin64();
1437
1438  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
1439         "Var args not supported with calling convention fastcc");
1440
1441  // Assign locations to all of the incoming arguments.
1442  SmallVector<CCValAssign, 16> ArgLocs;
1443  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1444                 ArgLocs, *DAG.getContext());
1445  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1446
1447  unsigned LastVal = ~0U;
1448  SDValue ArgValue;
1449  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1450    CCValAssign &VA = ArgLocs[i];
1451    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1452    // places.
1453    assert(VA.getValNo() != LastVal &&
1454           "Don't support value assigned to multiple locs yet");
1455    LastVal = VA.getValNo();
1456
1457    if (VA.isRegLoc()) {
1458      EVT RegVT = VA.getLocVT();
1459      TargetRegisterClass *RC = NULL;
1460      if (RegVT == MVT::i32)
1461        RC = X86::GR32RegisterClass;
1462      else if (Is64Bit && RegVT == MVT::i64)
1463        RC = X86::GR64RegisterClass;
1464      else if (RegVT == MVT::f32)
1465        RC = X86::FR32RegisterClass;
1466      else if (RegVT == MVT::f64)
1467        RC = X86::FR64RegisterClass;
1468      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
1469        RC = X86::VR128RegisterClass;
1470      else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
1471        RC = X86::VR64RegisterClass;
1472      else
1473        llvm_unreachable("Unknown argument type!");
1474
1475      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1476      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1477
1478      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1479      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1480      // right size.
1481      if (VA.getLocInfo() == CCValAssign::SExt)
1482        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1483                               DAG.getValueType(VA.getValVT()));
1484      else if (VA.getLocInfo() == CCValAssign::ZExt)
1485        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1486                               DAG.getValueType(VA.getValVT()));
1487      else if (VA.getLocInfo() == CCValAssign::BCvt)
1488        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1489
1490      if (VA.isExtInLoc()) {
1491        // Handle MMX values passed in XMM regs.
1492        if (RegVT.isVector()) {
1493          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1494                                 ArgValue, DAG.getConstant(0, MVT::i64));
1495          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1496        } else
1497          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1498      }
1499    } else {
1500      assert(VA.isMemLoc());
1501      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1502    }
1503
1504    // If the value is passed via a pointer, do a load.
1505    if (VA.getLocInfo() == CCValAssign::Indirect)
1506      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0);
1507
1508    InVals.push_back(ArgValue);
1509  }
1510
1511  // The x86-64 ABI for returning structs by value requires that we copy
1512  // the sret argument into %rax for the return. Save the argument into
1513  // a virtual register so that we can access it from the return points.
1514  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1515    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1516    unsigned Reg = FuncInfo->getSRetReturnReg();
1517    if (!Reg) {
1518      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1519      FuncInfo->setSRetReturnReg(Reg);
1520    }
1521    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1522    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1523  }
1524
1525  unsigned StackSize = CCInfo.getNextStackOffset();
1526  // Align the stack specially for tail calls.
1527  if (PerformTailCallOpt && CallConv == CallingConv::Fast)
1528    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1529
1530  // If the function takes a variable number of arguments, make a frame index for
1531  // the start of the first vararg value... for expansion of llvm.va_start.
1532  if (isVarArg) {
1533    if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
1534      VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize, true, false);
1535    }
1536    if (Is64Bit) {
1537      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1538
1539      // FIXME: We should really autogenerate these arrays
1540      static const unsigned GPR64ArgRegsWin64[] = {
1541        X86::RCX, X86::RDX, X86::R8,  X86::R9
1542      };
1543      static const unsigned XMMArgRegsWin64[] = {
1544        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1545      };
1546      static const unsigned GPR64ArgRegs64Bit[] = {
1547        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1548      };
1549      static const unsigned XMMArgRegs64Bit[] = {
1550        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1551        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1552      };
1553      const unsigned *GPR64ArgRegs, *XMMArgRegs;
1554
1555      if (IsWin64) {
1556        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1557        GPR64ArgRegs = GPR64ArgRegsWin64;
1558        XMMArgRegs = XMMArgRegsWin64;
1559      } else {
1560        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1561        GPR64ArgRegs = GPR64ArgRegs64Bit;
1562        XMMArgRegs = XMMArgRegs64Bit;
1563      }
1564      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1565                                                       TotalNumIntRegs);
1566      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1567                                                       TotalNumXMMRegs);
1568
1569      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1570      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1571             "SSE register cannot be used when SSE is disabled!");
1572      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1573             "SSE register cannot be used when SSE is disabled!");
1574      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1575        // Soft float, no-implicit-float (kernel code), or missing SSE support
1576        // means SSE is disabled, so don't spill the XMM registers on the stack.
1577        TotalNumXMMRegs = 0;
1578
1579      // For X86-64, if there are vararg parameters that are passed via
1580      // registers, then we must store them to their spots on the stack so they
1581      // may be loaded by dereferencing the result of va_next.
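      // VarArgsGPOffset is the offset of the first unused GPR slot in the
      // register save area; VarArgsFPOffset is the offset of the first unused
      // XMM slot, which comes after all of the GPR slots.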
1582      VarArgsGPOffset = NumIntRegs * 8;
1583      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
1584      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
1585                                                 TotalNumXMMRegs * 16, 16,
1586                                                 false);
1587
1588      // Store the integer parameter registers.
1589      SmallVector<SDValue, 8> MemOps;
1590      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
1591      unsigned Offset = VarArgsGPOffset;
1592      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1593        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1594                                  DAG.getIntPtrConstant(Offset));
1595        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1596                                     X86::GR64RegisterClass);
1597        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1598        SDValue Store =
1599          DAG.getStore(Val.getValue(1), dl, Val, FIN,
1600                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
1601                       Offset);
1602        MemOps.push_back(Store);
1603        Offset += 8;
1604      }
1605
1606      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
1607        // Now store the XMM (fp + vector) parameter registers.
1608        SmallVector<SDValue, 11> SaveXMMOps;
1609        SaveXMMOps.push_back(Chain);
1610
1611        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
1612        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
1613        SaveXMMOps.push_back(ALVal);
1614
1615        SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex));
1616        SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset));
1617
1618        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1619          unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
1620                                       X86::VR128RegisterClass);
1621          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
1622          SaveXMMOps.push_back(Val);
1623        }
1624        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
1625                                     MVT::Other,
1626                                     &SaveXMMOps[0], SaveXMMOps.size()));
1627      }
1628
1629      if (!MemOps.empty())
1630        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1631                            &MemOps[0], MemOps.size());
1632    }
1633  }
1634
1635  // Some CCs need callee pop.
1636  if (IsCalleePop(isVarArg, CallConv)) {
1637    BytesToPopOnReturn  = StackSize; // Callee pops everything.
1638    BytesCallerReserves = 0;
1639  } else {
1640    BytesToPopOnReturn  = 0; // Callee pops nothing.
1641    // If this is an sret function, the return should pop the hidden pointer.
1642    if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins))
1643      BytesToPopOnReturn = 4;
1644    BytesCallerReserves = StackSize;
1645  }
1646
1647  if (!Is64Bit) {
1648    RegSaveFrameIndex = 0xAAAAAAA;   // RegSaveFrameIndex is X86-64 only.
1649    if (CallConv == CallingConv::X86_FastCall)
1650      VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
1651  }
1652
1653  FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
1654
1655  return Chain;
1656}
1657
1658SDValue
1659X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
1660                                    SDValue StackPtr, SDValue Arg,
1661                                    DebugLoc dl, SelectionDAG &DAG,
1662                                    const CCValAssign &VA,
1663                                    ISD::ArgFlagsTy Flags) {
1664  const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
1665  unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
1666  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1667  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1668  if (Flags.isByVal()) {
1669    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1670  }
1671  return DAG.getStore(Chain, dl, Arg, PtrOff,
1672                      PseudoSourceValue::getStack(), LocMemOffset);
1673}
1674
1675/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
1676/// optimization is performed and it is required.
1677SDValue
1678X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1679                                           SDValue &OutRetAddr,
1680                                           SDValue Chain,
1681                                           bool IsTailCall,
1682                                           bool Is64Bit,
1683                                           int FPDiff,
1684                                           DebugLoc dl) {
1685  if (!IsTailCall || FPDiff==0) return Chain;
1686
1687  // Adjust the Return address stack slot.
1688  EVT VT = getPointerTy();
1689  OutRetAddr = getReturnAddressFrameIndex(DAG);
1690
1691  // Load the "old" Return address.
1692  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
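  // Hand the loaded return address back through OutRetAddr (result #0) and
  // return the chain result of the load (result #1).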
1693  return SDValue(OutRetAddr.getNode(), 1);
1694}
1695
1696/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1697/// optimization is performed and it is required (FPDiff!=0).
1698static SDValue
1699EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1700                         SDValue Chain, SDValue RetAddrFrIdx,
1701                         bool Is64Bit, int FPDiff, DebugLoc dl) {
1702  // Store the return address to the appropriate stack slot.
1703  if (!FPDiff) return Chain;
1704  // Calculate the new stack slot for the return address.
1705  int SlotSize = Is64Bit ? 8 : 4;
1706  int NewReturnAddrFI =
1707    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize,
1708                                         true, false);
1709  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1710  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1711  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1712                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
1713  return Chain;
1714}
1715
1716SDValue
1717X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1718                             CallingConv::ID CallConv, bool isVarArg,
1719                             bool isTailCall,
1720                             const SmallVectorImpl<ISD::OutputArg> &Outs,
1721                             const SmallVectorImpl<ISD::InputArg> &Ins,
1722                             DebugLoc dl, SelectionDAG &DAG,
1723                             SmallVectorImpl<SDValue> &InVals) {
1724
1725  MachineFunction &MF = DAG.getMachineFunction();
1726  bool Is64Bit        = Subtarget->is64Bit();
1727  bool IsStructRet    = CallIsStructReturn(Outs);
1728
1729  assert((!isTailCall ||
1730          (CallConv == CallingConv::Fast && PerformTailCallOpt)) &&
1731         "IsEligibleForTailCallOptimization missed a case!");
1732  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
1733         "Var args not supported with calling convention fastcc");
1734
1735  // Analyze operands of the call, assigning locations to each operand.
1736  SmallVector<CCValAssign, 16> ArgLocs;
1737  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1738                 ArgLocs, *DAG.getContext());
1739  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1740
1741  // Get a count of how many bytes are to be pushed on the stack.
1742  unsigned NumBytes = CCInfo.getNextStackOffset();
1743  if (PerformTailCallOpt && CallConv == CallingConv::Fast)
1744    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1745
1746  int FPDiff = 0;
1747  if (isTailCall) {
1748    // Lower arguments at fp - stackoffset + fpdiff.
1749    unsigned NumBytesCallerPushed =
1750      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1751    FPDiff = NumBytesCallerPushed - NumBytes;
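    // A negative FPDiff means this callee needs more argument stack space than
    // the caller reserved, so the return address will have to be moved to make
    // room (see the fast calling convention notes below).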
1752
1753    // Set the delta of movement of the returnaddr stackslot.
1754    // But only set it if the new delta is less than the previously recorded one.
1755    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1756      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1757  }
1758
1759  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1760
1761  SDValue RetAddrFrIdx;
1762  // Load return address for tail calls.
1763  Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit,
1764                                  FPDiff, dl);
1765
1766  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1767  SmallVector<SDValue, 8> MemOpChains;
1768  SDValue StackPtr;
1769
1770  // Walk the register/memloc assignments, inserting copies/loads.  In the case
1771  // of tail call optimization arguments are handled later.
1772  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1773    CCValAssign &VA = ArgLocs[i];
1774    EVT RegVT = VA.getLocVT();
1775    SDValue Arg = Outs[i].Val;
1776    ISD::ArgFlagsTy Flags = Outs[i].Flags;
1777    bool isByVal = Flags.isByVal();
1778
1779    // Promote the value if needed.
1780    switch (VA.getLocInfo()) {
1781    default: llvm_unreachable("Unknown loc info!");
1782    case CCValAssign::Full: break;
1783    case CCValAssign::SExt:
1784      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1785      break;
1786    case CCValAssign::ZExt:
1787      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1788      break;
1789    case CCValAssign::AExt:
1790      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1791        // Special case: passing MMX values in XMM registers.
1792        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1793        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1794        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1795      } else
1796        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1797      break;
1798    case CCValAssign::BCvt:
1799      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
1800      break;
1801    case CCValAssign::Indirect: {
1802      // Store the argument.
1803      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
1804      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1805      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
1806                           PseudoSourceValue::getFixedStack(FI), 0);
1807      Arg = SpillSlot;
1808      break;
1809    }
1810    }
1811
1812    if (VA.isRegLoc()) {
1813      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1814    } else {
1815      if (!isTailCall || (isTailCall && isByVal)) {
1816        assert(VA.isMemLoc());
1817        if (StackPtr.getNode() == 0)
1818          StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
1819
1820        MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1821                                               dl, DAG, VA, Flags));
1822      }
1823    }
1824  }
1825
1826  if (!MemOpChains.empty())
1827    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1828                        &MemOpChains[0], MemOpChains.size());
1829
1830  // Build a sequence of copy-to-reg nodes chained together with token chain
1831  // and flag operands which copy the outgoing args into registers.
1832  SDValue InFlag;
1833  // Tail call byval lowering might overwrite argument registers so in case of
1834  // tail call optimization the copies to registers are lowered later.
1835  if (!isTailCall)
1836    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1837      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1838                               RegsToPass[i].second, InFlag);
1839      InFlag = Chain.getValue(1);
1840    }
1841
1842
1843  if (Subtarget->isPICStyleGOT()) {
1844    // ELF / PIC requires the GOT pointer to be in the EBX register before
1845    // function calls made via the PLT.
1846    if (!isTailCall) {
1847      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
1848                               DAG.getNode(X86ISD::GlobalBaseReg,
1849                                           DebugLoc::getUnknownLoc(),
1850                                           getPointerTy()),
1851                               InFlag);
1852      InFlag = Chain.getValue(1);
1853    } else {
1854      // If we are tail calling and generating PIC/GOT style code load the
1855      // address of the callee into ECX. The value in ecx is used as target of
1856      // the tail jump. This is done to circumvent the ebx/callee-saved problem
1857      // for tail calls on PIC/GOT architectures. Normally we would just put the
1858      // address of GOT into ebx and then call target@PLT. But for tail calls
1859      // ebx would be restored (since ebx is callee saved) before jumping to the
1860      // target@PLT.
1861
1862      // Note: The actual moving to ECX is done further down.
1863      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1864      if (G && !G->getGlobal()->hasHiddenVisibility() &&
1865          !G->getGlobal()->hasProtectedVisibility())
1866        Callee = LowerGlobalAddress(Callee, DAG);
1867      else if (isa<ExternalSymbolSDNode>(Callee))
1868        Callee = LowerExternalSymbol(Callee, DAG);
1869    }
1870  }
1871
1872  if (Is64Bit && isVarArg) {
1873    // From AMD64 ABI document:
1874    // For calls that may call functions that use varargs or stdargs
1875    // (prototype-less calls or calls to functions containing ellipsis (...) in
1876    // the declaration) %al is used as hidden argument to specify the number
1877    // of SSE registers used. The contents of %al do not need to match exactly
1878    // the number of registers, but must be an upper bound on the number of SSE
1879    // registers used and is in the range 0 - 8 inclusive.
1880
1881    // FIXME: Verify this on Win64
1882    // Count the number of XMM registers allocated.
1883    static const unsigned XMMArgRegs[] = {
1884      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1885      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1886    };
1887    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1888    assert((Subtarget->hasSSE1() || !NumXMMRegs)
1889           && "SSE registers cannot be used when SSE is disabled");
1890
1891    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
1892                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1893    InFlag = Chain.getValue(1);
1894  }
1895
1896
1897  // For tail calls lower the arguments to the 'real' stack slot.
1898  if (isTailCall) {
1899    // Force all the incoming stack arguments to be loaded from the stack
1900    // before any new outgoing arguments are stored to the stack, because the
1901    // outgoing stack slots may alias the incoming argument stack slots, and
1902    // the alias isn't otherwise explicit. This is slightly more conservative
1903    // than necessary, because it means that each store effectively depends
1904    // on every argument instead of just those arguments it would clobber.
1905    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
1906
1907    SmallVector<SDValue, 8> MemOpChains2;
1908    SDValue FIN;
1909    int FI = 0;
1910    // Do not flag preceding copytoreg stuff together with the following stuff.
1911    InFlag = SDValue();
1912    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1913      CCValAssign &VA = ArgLocs[i];
1914      if (!VA.isRegLoc()) {
1915        assert(VA.isMemLoc());
1916        SDValue Arg = Outs[i].Val;
1917        ISD::ArgFlagsTy Flags = Outs[i].Flags;
1918        // Create frame index.
1919        int32_t Offset = VA.getLocMemOffset()+FPDiff;
1920        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
1921        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
1922        FIN = DAG.getFrameIndex(FI, getPointerTy());
1923
1924        if (Flags.isByVal()) {
1925          // Copy relative to framepointer.
1926          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
1927          if (StackPtr.getNode() == 0)
1928            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
1929                                          getPointerTy());
1930          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
1931
1932          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
1933                                                           ArgChain,
1934                                                           Flags, DAG, dl));
1935        } else {
1936          // Store relative to framepointer.
1937          MemOpChains2.push_back(
1938            DAG.getStore(ArgChain, dl, Arg, FIN,
1939                         PseudoSourceValue::getFixedStack(FI), 0));
1940        }
1941      }
1942    }
1943
1944    if (!MemOpChains2.empty())
1945      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1946                          &MemOpChains2[0], MemOpChains2.size());
1947
1948    // Copy arguments to their registers.
1949    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1950      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1951                               RegsToPass[i].second, InFlag);
1952      InFlag = Chain.getValue(1);
1953    }
1954    InFlag = SDValue();
1955
1956    // Store the return address to the appropriate stack slot.
1957    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
1958                                     FPDiff, dl);
1959  }
1960
1961  bool WasGlobalOrExternal = false;
1962  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
1963    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
1964    // In the 64-bit large code model, we have to make all calls
1965    // through a register, since the call instruction's 32-bit
1966    // pc-relative offset may not be large enough to hold the whole
1967    // address.
1968  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1969    WasGlobalOrExternal = true;
1970    // If the callee is a GlobalAddress node (quite common, every direct call
1971    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
1972    // it.
1973
1974    // We should use extra load for direct calls to dllimported functions in
1975    // non-JIT mode.
1976    GlobalValue *GV = G->getGlobal();
1977    if (!GV->hasDLLImportLinkage()) {
1978      unsigned char OpFlags = 0;
1979
1980      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
1981      // external symbols must go through the PLT in PIC mode.  If the symbol
1982      // has hidden or protected visibility, or if it is static or local, then
1983      // we don't need to use the PLT - we can directly call it.
1984      if (Subtarget->isTargetELF() &&
1985          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1986          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
1987        OpFlags = X86II::MO_PLT;
1988      } else if (Subtarget->isPICStyleStubAny() &&
1989               (GV->isDeclaration() || GV->isWeakForLinker()) &&
1990               Subtarget->getDarwinVers() < 9) {
1991        // PC-relative references to external symbols should go through $stub,
1992        // unless we're building with the leopard linker or later, which
1993        // automatically synthesizes these stubs.
1994        OpFlags = X86II::MO_DARWIN_STUB;
1995      }
1996
1997      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
1998                                          G->getOffset(), OpFlags);
1999    }
2000  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2001    WasGlobalOrExternal = true;
2002    unsigned char OpFlags = 0;
2003
2004    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
2005    // symbols should go through the PLT.
2006    if (Subtarget->isTargetELF() &&
2007        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2008      OpFlags = X86II::MO_PLT;
2009    } else if (Subtarget->isPICStyleStubAny() &&
2010             Subtarget->getDarwinVers() < 9) {
2011      // PC-relative references to external symbols should go through $stub,
2012      // unless we're building with the leopard linker or later, which
2013      // automatically synthesizes these stubs.
2014      OpFlags = X86II::MO_DARWIN_STUB;
2015    }
2016
2017    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2018                                         OpFlags);
2019  }
2020
2021  if (isTailCall && !WasGlobalOrExternal) {
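    // Materialize the callee address in a scratch register (R11 in 64-bit
    // mode, EAX in 32-bit mode) and jump through it; neither register is
    // callee-saved, so the target is not clobbered by the epilogue's restores.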
2022    unsigned Opc = Is64Bit ? X86::R11 : X86::EAX;
2023
2024    Chain = DAG.getCopyToReg(Chain, dl,
2025                             DAG.getRegister(Opc, getPointerTy()),
2026                             Callee, InFlag);
2027    Callee = DAG.getRegister(Opc, getPointerTy());
2028    // Add register as live out.
2029    MF.getRegInfo().addLiveOut(Opc);
2030  }
2031
2032  // Returns a chain & a flag for retval copy to use.
2033  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
2034  SmallVector<SDValue, 8> Ops;
2035
2036  if (isTailCall) {
2037    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2038                           DAG.getIntPtrConstant(0, true), InFlag);
2039    InFlag = Chain.getValue(1);
2040  }
2041
2042  Ops.push_back(Chain);
2043  Ops.push_back(Callee);
2044
2045  if (isTailCall)
2046    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2047
2048  // Add argument registers to the end of the list so that they are known live
2049  // into the call.
2050  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2051    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2052                                  RegsToPass[i].second.getValueType()));
2053
2054  // Add an implicit use GOT pointer in EBX.
2055  if (!isTailCall && Subtarget->isPICStyleGOT())
2056    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2057
2058  // Add an implicit use of AL for x86 vararg functions.
2059  if (Is64Bit && isVarArg)
2060    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2061
2062  if (InFlag.getNode())
2063    Ops.push_back(InFlag);
2064
2065  if (isTailCall) {
2066    // If this is the first return lowered for this function, add the regs
2067    // to the liveout set for the function.
2068    if (MF.getRegInfo().liveout_empty()) {
2069      SmallVector<CCValAssign, 16> RVLocs;
2070      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
2071                     *DAG.getContext());
2072      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2073      for (unsigned i = 0; i != RVLocs.size(); ++i)
2074        if (RVLocs[i].isRegLoc())
2075          MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
2076    }
2077
2078    assert(((Callee.getOpcode() == ISD::Register &&
2079               (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
2080                cast<RegisterSDNode>(Callee)->getReg() == X86::R9)) ||
2081              Callee.getOpcode() == ISD::TargetExternalSymbol ||
2082              Callee.getOpcode() == ISD::TargetGlobalAddress) &&
2083             "Expecting an global address, external symbol, or register");
2084
2085    return DAG.getNode(X86ISD::TC_RETURN, dl,
2086                       NodeTys, &Ops[0], Ops.size());
2087  }
2088
2089  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2090  InFlag = Chain.getValue(1);
2091
2092  // Create the CALLSEQ_END node.
2093  unsigned NumBytesForCalleeToPush;
2094  if (IsCalleePop(isVarArg, CallConv))
2095    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2096  else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
2097    // If this is a call to a struct-return function, the callee
2098    // pops the hidden struct pointer, so we have to push it back.
2099    // This is common for Darwin/X86, Linux & Mingw32 targets.
2100    NumBytesForCalleeToPush = 4;
2101  else
2102    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2103
2104  // Returns a flag for retval copy to use.
2105  Chain = DAG.getCALLSEQ_END(Chain,
2106                             DAG.getIntPtrConstant(NumBytes, true),
2107                             DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2108                                                   true),
2109                             InFlag);
2110  InFlag = Chain.getValue(1);
2111
2112  // Handle result values, copying them out of physregs into vregs that we
2113  // return.
2114  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2115                         Ins, dl, DAG, InVals);
2116}
2117
2118
2119//===----------------------------------------------------------------------===//
2120//                Fast Calling Convention (tail call) implementation
2121//===----------------------------------------------------------------------===//
2122
2123//  Like StdCall, the callee cleans up the arguments, except that ECX is
2124//  reserved for storing the tail called function address. Only 2 registers are
2125//  free for argument passing (inreg). Tail call optimization is performed
2126//  provided:
2127//                * tailcallopt is enabled
2128//                * caller/callee are fastcc
2129//  On X86_64 architecture with GOT-style position independent code only local
2130//  (within module) calls are supported at the moment.
2131//  To keep the stack aligned according to the platform ABI, the function
2132//  GetAlignedArgumentStackSize ensures that the argument delta is always a
2133//  multiple of the stack alignment. (Dynamic linkers need this - darwin's
2134//  dyld for example.) If a tail-called callee has more arguments than the
2135//  caller, the caller needs to make sure that there is room to move the
2136//  RETADDR to. This is achieved by reserving an area the size of the argument
2137//  delta right after the original RETADDR, but before the saved frame pointer
2138//  or the spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
2139//  stack layout:
2140//    arg1
2141//    arg2
2142//    RETADDR
2143//    [ new RETADDR
2144//      move area ]
2145//    (possible EBP)
2146//    ESI
2147//    EDI
2148//    local1 ..
2149
2150/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 bytes
2151/// for a 16 byte alignment requirement.
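/// For example, with a 16 byte stack alignment and a 4 byte slot size, a
/// StackSize of 20 is rounded up to 28 (16*1 + 12), so that pushing the 4 byte
/// return address slot brings the stack back to a 16 byte boundary.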
2152unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2153                                                        SelectionDAG& DAG) {
2154  MachineFunction &MF = DAG.getMachineFunction();
2155  const TargetMachine &TM = MF.getTarget();
2156  const TargetFrameInfo &TFI = *TM.getFrameInfo();
2157  unsigned StackAlignment = TFI.getStackAlignment();
2158  uint64_t AlignMask = StackAlignment - 1;
2159  int64_t Offset = StackSize;
2160  uint64_t SlotSize = TD->getPointerSize();
2161  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2162    // The current misalignment is at most (StackAlignment - SlotSize), so just add the difference.
2163    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2164  } else {
2165    // Mask out the lower bits, then add one full stack alignment plus (StackAlignment - SlotSize).
2166    Offset = ((~AlignMask) & Offset) + StackAlignment +
2167      (StackAlignment-SlotSize);
2168  }
2169  return Offset;
2170}
2171
2172/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2173/// for tail call optimization. Targets which want to do tail call
2174/// optimization should implement this function.
2175bool
2176X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2177                                                     CallingConv::ID CalleeCC,
2178                                                     bool isVarArg,
2179                                      const SmallVectorImpl<ISD::InputArg> &Ins,
2180                                                     SelectionDAG& DAG) const {
2181  MachineFunction &MF = DAG.getMachineFunction();
2182  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
2183  return CalleeCC == CallingConv::Fast && CallerCC == CalleeCC;
2184}
2185
2186FastISel *
2187X86TargetLowering::createFastISel(MachineFunction &mf,
2188                                  MachineModuleInfo *mmo,
2189                                  DwarfWriter *dw,
2190                                  DenseMap<const Value *, unsigned> &vm,
2191                                  DenseMap<const BasicBlock *,
2192                                           MachineBasicBlock *> &bm,
2193                                  DenseMap<const AllocaInst *, int> &am
2194#ifndef NDEBUG
2195                                  , SmallSet<Instruction*, 8> &cil
2196#endif
2197                                  ) {
2198  return X86::createFastISel(mf, mmo, dw, vm, bm, am
2199#ifndef NDEBUG
2200                             , cil
2201#endif
2202                             );
2203}
2204
2205
2206//===----------------------------------------------------------------------===//
2207//                           Other Lowering Hooks
2208//===----------------------------------------------------------------------===//
2209
2210
2211SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
2212  MachineFunction &MF = DAG.getMachineFunction();
2213  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2214  int ReturnAddrIndex = FuncInfo->getRAIndex();
2215
2216  if (ReturnAddrIndex == 0) {
2217    // Set up a frame object for the return address.
2218    uint64_t SlotSize = TD->getPointerSize();
2219    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2220                                                           true, false);
2221    FuncInfo->setRAIndex(ReturnAddrIndex);
2222  }
2223
2224  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2225}
2226
2227
2228bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2229                                       bool hasSymbolicDisplacement) {
2230  // Offset should fit into 32 bit immediate field.
2231  if (!isInt32(Offset))
2232    return false;
2233
2234  // If we don't have a symbolic displacement - we don't have any extra
2235  // restrictions.
2236  if (!hasSymbolicDisplacement)
2237    return true;
2238
2239  // FIXME: Some tweaks might be needed for medium code model.
2240  if (M != CodeModel::Small && M != CodeModel::Kernel)
2241    return false;
2242
2243  // For the small code model we assume that the last object is 16MB before
2244  // the end of the 31 bit boundary. We may also accept pretty large negative
2245  // constants knowing that all objects are in the positive half of the address space.
2246  if (M == CodeModel::Small && Offset < 16*1024*1024)
2247    return true;
2248
2249  // For the kernel code model we know that all objects reside in the negative
2250  // half of the 32 bit address space. We may not accept negative offsets, since
2251  // they may be just outside the object, but we may accept pretty large positive ones.
2252  if (M == CodeModel::Kernel && Offset > 0)
2253    return true;
2254
2255  return false;
2256}
2257
2258/// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the
2259/// X86-specific condition code, returning the condition code and the LHS/RHS of the
2260/// comparison to make.
2261static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2262                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2263  if (!isFP) {
2264    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2265      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2266        // X > -1   -> X == 0, jump !sign.
2267        RHS = DAG.getConstant(0, RHS.getValueType());
2268        return X86::COND_NS;
2269      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2270        // X < 0   -> X == 0, jump on sign.
2271        return X86::COND_S;
2272      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2273        // X < 1   -> X <= 0
2274        RHS = DAG.getConstant(0, RHS.getValueType());
2275        return X86::COND_LE;
2276      }
2277    }
2278
2279    switch (SetCCOpcode) {
2280    default: llvm_unreachable("Invalid integer condition!");
2281    case ISD::SETEQ:  return X86::COND_E;
2282    case ISD::SETGT:  return X86::COND_G;
2283    case ISD::SETGE:  return X86::COND_GE;
2284    case ISD::SETLT:  return X86::COND_L;
2285    case ISD::SETLE:  return X86::COND_LE;
2286    case ISD::SETNE:  return X86::COND_NE;
2287    case ISD::SETULT: return X86::COND_B;
2288    case ISD::SETUGT: return X86::COND_A;
2289    case ISD::SETULE: return X86::COND_BE;
2290    case ISD::SETUGE: return X86::COND_AE;
2291    }
2292  }
2293
2294  // First determine if it is required or is profitable to flip the operands.
2295
2296  // If LHS is a foldable load, but RHS is not, flip the condition.
2297  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2298      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2299    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2300    std::swap(LHS, RHS);
2301  }
2302
2303  switch (SetCCOpcode) {
2304  default: break;
2305  case ISD::SETOLT:
2306  case ISD::SETOLE:
2307  case ISD::SETUGT:
2308  case ISD::SETUGE:
2309    std::swap(LHS, RHS);
2310    break;
2311  }
2312
2313  // On a floating point condition, the flags are set as follows:
2314  // ZF  PF  CF   op
2315  //  0 | 0 | 0 | X > Y
2316  //  0 | 0 | 1 | X < Y
2317  //  1 | 0 | 0 | X == Y
2318  //  1 | 1 | 1 | unordered
2319  switch (SetCCOpcode) {
2320  default: llvm_unreachable("Condcode should be pre-legalized away");
2321  case ISD::SETUEQ:
2322  case ISD::SETEQ:   return X86::COND_E;
2323  case ISD::SETOLT:              // flipped
2324  case ISD::SETOGT:
2325  case ISD::SETGT:   return X86::COND_A;
2326  case ISD::SETOLE:              // flipped
2327  case ISD::SETOGE:
2328  case ISD::SETGE:   return X86::COND_AE;
2329  case ISD::SETUGT:              // flipped
2330  case ISD::SETULT:
2331  case ISD::SETLT:   return X86::COND_B;
2332  case ISD::SETUGE:              // flipped
2333  case ISD::SETULE:
2334  case ISD::SETLE:   return X86::COND_BE;
2335  case ISD::SETONE:
2336  case ISD::SETNE:   return X86::COND_NE;
2337  case ISD::SETUO:   return X86::COND_P;
2338  case ISD::SETO:    return X86::COND_NP;
2339  case ISD::SETOEQ:
2340  case ISD::SETUNE:  return X86::COND_INVALID;
2341  }
2342}
2343
2344/// hasFPCMov - is there a floating point cmov for the specific X86 condition
2345/// code. Current x86 isa includes the following FP cmov instructions:
2346/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2347static bool hasFPCMov(unsigned X86CC) {
2348  switch (X86CC) {
2349  default:
2350    return false;
2351  case X86::COND_B:
2352  case X86::COND_BE:
2353  case X86::COND_E:
2354  case X86::COND_P:
2355  case X86::COND_A:
2356  case X86::COND_AE:
2357  case X86::COND_NE:
2358  case X86::COND_NP:
2359    return true;
2360  }
2361}
2362
2363/// isFPImmLegal - Returns true if the target can instruction select the
2364/// specified FP immediate natively. If false, the legalizer will
2365/// materialize the FP immediate as a load from a constant pool.
2366bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2367  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2368    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2369      return true;
2370  }
2371  return false;
2372}
2373
2374/// isUndefOrInRange - Return true if Val is undef or if its value falls within
2375/// the specified range [Low, Hi).
2376static bool isUndefOrInRange(int Val, int Low, int Hi) {
2377  return (Val < 0) || (Val >= Low && Val < Hi);
2378}
2379
2380/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2381/// specified value.
2382static bool isUndefOrEqual(int Val, int CmpVal) {
2383  if (Val < 0 || Val == CmpVal)
2384    return true;
2385  return false;
2386}
2387
2388/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2389/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
2390/// the second operand.
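/// For example, the v4i32 mask <2,3,0,1> qualifies (every index refers to the
/// first operand), while <0,1,4,5> does not.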
2391static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2392  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
2393    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2394  if (VT == MVT::v2f64 || VT == MVT::v2i64)
2395    return (Mask[0] < 2 && Mask[1] < 2);
2396  return false;
2397}
2398
2399bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
2400  SmallVector<int, 8> M;
2401  N->getMask(M);
2402  return ::isPSHUFDMask(M, N->getValueType(0));
2403}
2404
2405/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
2406/// is suitable for input to PSHUFHW.
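/// For example, the v8i16 mask <0,1,2,3,7,6,5,4> qualifies: the low quadword
/// is left in place and only the high quadword is permuted.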
2407static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2408  if (VT != MVT::v8i16)
2409    return false;
2410
2411  // Lower quadword copied in order or undef.
2412  for (int i = 0; i != 4; ++i)
2413    if (Mask[i] >= 0 && Mask[i] != i)
2414      return false;
2415
2416  // Upper quadword shuffled.
2417  for (int i = 4; i != 8; ++i)
2418    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2419      return false;
2420
2421  return true;
2422}
2423
2424bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
2425  SmallVector<int, 8> M;
2426  N->getMask(M);
2427  return ::isPSHUFHWMask(M, N->getValueType(0));
2428}
2429
2430/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
2431/// is suitable for input to PSHUFLW.
2432static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2433  if (VT != MVT::v8i16)
2434    return false;
2435
2436  // Upper quadword copied in order or undef.
2437  for (int i = 4; i != 8; ++i)
2438    if (Mask[i] >= 0 && Mask[i] != i)
2439      return false;
2440
2441  // Lower quadword shuffled.
2442  for (int i = 0; i != 4; ++i)
2443    if (Mask[i] >= 4)
2444      return false;
2445
2446  return true;
2447}
2448
2449bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
2450  SmallVector<int, 8> M;
2451  N->getMask(M);
2452  return ::isPSHUFLWMask(M, N->getValueType(0));
2453}
2454
2455/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
2456/// is suitable for input to PALIGNR.
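/// A valid mask selects a consecutive run of elements from the two inputs
/// viewed as one concatenated vector; e.g. for v4i32, <1,2,3,4> takes elements
/// 1-3 of V1 followed by element 0 of V2 and is accepted (a 4-byte shift; see
/// getShufflePALIGNRImmediate below).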
2457static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
2458                          bool hasSSSE3) {
2459  int i, e = VT.getVectorNumElements();
2460
2461  // Do not handle v2i64 / v2f64 shuffles with palignr.
2462  if (e < 4 || !hasSSSE3)
2463    return false;
2464
2465  for (i = 0; i != e; ++i)
2466    if (Mask[i] >= 0)
2467      break;
2468
2469  // All undef, not a palignr.
2470  if (i == e)
2471    return false;
2472
2473  // Determine if it's ok to perform a palignr with only the LHS, since we
2474  // don't have access to the actual shuffle elements to see if RHS is undef.
2475  bool Unary = Mask[i] < (int)e;
2476  bool NeedsUnary = false;
2477
2478  int s = Mask[i] - i;
2479
2480  // Check the rest of the elements to see if they are consecutive.
2481  for (++i; i != e; ++i) {
2482    int m = Mask[i];
2483    if (m < 0)
2484      continue;
2485
2486    Unary = Unary && (m < (int)e);
2487    NeedsUnary = NeedsUnary || (m < s);
2488
2489    if (NeedsUnary && !Unary)
2490      return false;
2491    if (Unary && m != ((s+i) & (e-1)))
2492      return false;
2493    if (!Unary && m != (s+i))
2494      return false;
2495  }
2496  return true;
2497}
2498
2499bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
2500  SmallVector<int, 8> M;
2501  N->getMask(M);
2502  return ::isPALIGNRMask(M, N->getValueType(0), true);
2503}
2504
2505/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2506/// specifies a shuffle of elements that is suitable for input to SHUFP*.
2507static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2508  int NumElems = VT.getVectorNumElements();
2509  if (NumElems != 2 && NumElems != 4)
2510    return false;
2511
2512  int Half = NumElems / 2;
2513  for (int i = 0; i < Half; ++i)
2514    if (!isUndefOrInRange(Mask[i], 0, NumElems))
2515      return false;
2516  for (int i = Half; i < NumElems; ++i)
2517    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2518      return false;
2519
2520  return true;
2521}
2522
2523bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
2524  SmallVector<int, 8> M;
2525  N->getMask(M);
2526  return ::isSHUFPMask(M, N->getValueType(0));
2527}
2528
2529/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
2530/// the reverse of what x86 shuffles want. x86 shuffles require the lower
2531/// half elements to come from vector 1 (which would equal the dest.) and
2532/// the upper half to come from vector 2.
2533static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2534  int NumElems = VT.getVectorNumElements();
2535
2536  if (NumElems != 2 && NumElems != 4)
2537    return false;
2538
2539  int Half = NumElems / 2;
2540  for (int i = 0; i < Half; ++i)
2541    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2542      return false;
2543  for (int i = Half; i < NumElems; ++i)
2544    if (!isUndefOrInRange(Mask[i], 0, NumElems))
2545      return false;
2546  return true;
2547}
2548
2549static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
2550  SmallVector<int, 8> M;
2551  N->getMask(M);
2552  return isCommutedSHUFPMask(M, N->getValueType(0));
2553}
2554
2555/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
2556/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
2557bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
2558  if (N->getValueType(0).getVectorNumElements() != 4)
2559    return false;
2560
2561  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
2562  return isUndefOrEqual(N->getMaskElt(0), 6) &&
2563         isUndefOrEqual(N->getMaskElt(1), 7) &&
2564         isUndefOrEqual(N->getMaskElt(2), 2) &&
2565         isUndefOrEqual(N->getMaskElt(3), 3);
2566}
2567
2568/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
2569/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
2570/// <2, 3, 2, 3>
2571bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
2572  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2573
2574  if (NumElems != 4)
2575    return false;
2576
2577  return isUndefOrEqual(N->getMaskElt(0), 2) &&
2578         isUndefOrEqual(N->getMaskElt(1), 3) &&
2579         isUndefOrEqual(N->getMaskElt(2), 2) &&
2580         isUndefOrEqual(N->getMaskElt(3), 3);
2581}
2582
2583/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
2584/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
2585bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
2586  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2587
2588  if (NumElems != 2 && NumElems != 4)
2589    return false;
2590
2591  for (unsigned i = 0; i < NumElems/2; ++i)
2592    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
2593      return false;
2594
2595  for (unsigned i = NumElems/2; i < NumElems; ++i)
2596    if (!isUndefOrEqual(N->getMaskElt(i), i))
2597      return false;
2598
2599  return true;
2600}
2601
2602/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
2603/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
2604bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
2605  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2606
2607  if (NumElems != 2 && NumElems != 4)
2608    return false;
2609
2610  for (unsigned i = 0; i < NumElems/2; ++i)
2611    if (!isUndefOrEqual(N->getMaskElt(i), i))
2612      return false;
2613
2614  for (unsigned i = 0; i < NumElems/2; ++i)
2615    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
2616      return false;
2617
2618  return true;
2619}
2620
2621/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
2622/// specifies a shuffle of elements that is suitable for input to UNPCKL.
2623static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
2624                         bool V2IsSplat = false) {
2625  int NumElts = VT.getVectorNumElements();
2626  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2627    return false;
2628
2629  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2630    int BitI  = Mask[i];
2631    int BitI1 = Mask[i+1];
2632    if (!isUndefOrEqual(BitI, j))
2633      return false;
2634    if (V2IsSplat) {
2635      if (!isUndefOrEqual(BitI1, NumElts))
2636        return false;
2637    } else {
2638      if (!isUndefOrEqual(BitI1, j + NumElts))
2639        return false;
2640    }
2641  }
2642  return true;
2643}
2644
2645bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2646  SmallVector<int, 8> M;
2647  N->getMask(M);
2648  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
2649}
2650
2651/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
2652/// specifies a shuffle of elements that is suitable for input to UNPCKH.
2653static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
2654                         bool V2IsSplat = false) {
2655  int NumElts = VT.getVectorNumElements();
2656  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2657    return false;
2658
2659  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2660    int BitI  = Mask[i];
2661    int BitI1 = Mask[i+1];
2662    if (!isUndefOrEqual(BitI, j + NumElts/2))
2663      return false;
2664    if (V2IsSplat) {
2665      if (isUndefOrEqual(BitI1, NumElts))
2666        return false;
2667    } else {
2668      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
2669        return false;
2670    }
2671  }
2672  return true;
2673}
2674
2675bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2676  SmallVector<int, 8> M;
2677  N->getMask(M);
2678  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
2679}
2680
2681/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
2682/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
2683/// <0, 0, 1, 1>
2684static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2685  int NumElems = VT.getVectorNumElements();
2686  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2687    return false;
2688
2689  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
2690    int BitI  = Mask[i];
2691    int BitI1 = Mask[i+1];
2692    if (!isUndefOrEqual(BitI, j))
2693      return false;
2694    if (!isUndefOrEqual(BitI1, j))
2695      return false;
2696  }
2697  return true;
2698}
2699
2700bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
2701  SmallVector<int, 8> M;
2702  N->getMask(M);
2703  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
2704}
2705
2706/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
2707/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
2708/// <2, 2, 3, 3>
2709static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2710  int NumElems = VT.getVectorNumElements();
2711  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2712    return false;
2713
2714  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
2715    int BitI  = Mask[i];
2716    int BitI1 = Mask[i+1];
2717    if (!isUndefOrEqual(BitI, j))
2718      return false;
2719    if (!isUndefOrEqual(BitI1, j))
2720      return false;
2721  }
2722  return true;
2723}
2724
2725bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
2726  SmallVector<int, 8> M;
2727  N->getMask(M);
2728  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
2729}
2730
2731/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
2732/// specifies a shuffle of elements that is suitable for input to MOVSS,
2733/// MOVSD, and MOVD, i.e. setting the lowest element.
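/// For v4i32 that is the mask <4,1,2,3>: element 0 is taken from V2 and the
/// remaining elements are taken from V1 unchanged.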
2734static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2735  if (VT.getVectorElementType().getSizeInBits() < 32)
2736    return false;
2737
2738  int NumElts = VT.getVectorNumElements();
2739
2740  if (!isUndefOrEqual(Mask[0], NumElts))
2741    return false;
2742
2743  for (int i = 1; i < NumElts; ++i)
2744    if (!isUndefOrEqual(Mask[i], i))
2745      return false;
2746
2747  return true;
2748}
2749
2750bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
2751  SmallVector<int, 8> M;
2752  N->getMask(M);
2753  return ::isMOVLMask(M, N->getValueType(0));
2754}
2755
2756/// isCommutedMOVLMask - Returns true if the shuffle mask is exactly the
2757/// reverse of what x86 movss wants: the lowest element must be the lowest
2758/// element of vector 2 and the other elements must come from vector 1 in order.
2759static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
2760                               bool V2IsSplat = false, bool V2IsUndef = false) {
2761  int NumOps = VT.getVectorNumElements();
2762  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
2763    return false;
2764
2765  if (!isUndefOrEqual(Mask[0], 0))
2766    return false;
2767
2768  for (int i = 1; i < NumOps; ++i)
2769    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
2770          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
2771          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
2772      return false;
2773
2774  return true;
2775}
2776
2777static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
2778                           bool V2IsUndef = false) {
2779  SmallVector<int, 8> M;
2780  N->getMask(M);
2781  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
2782}
2783
2784/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2785/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
2786bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
2787  if (N->getValueType(0).getVectorNumElements() != 4)
2788    return false;
2789
2790  // Expect 1, 1, 3, 3
2791  for (unsigned i = 0; i < 2; ++i) {
2792    int Elt = N->getMaskElt(i);
2793    if (Elt >= 0 && Elt != 1)
2794      return false;
2795  }
2796
2797  bool HasHi = false;
2798  for (unsigned i = 2; i < 4; ++i) {
2799    int Elt = N->getMaskElt(i);
2800    if (Elt >= 0 && Elt != 3)
2801      return false;
2802    if (Elt == 3)
2803      HasHi = true;
2804  }
2805  // Don't use movshdup if it can be done with a shufps.
2806  // FIXME: verify that matching u, u, 3, 3 is what we want.
2807  return HasHi;
2808}
2809
2810/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2811/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
2812bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
2813  if (N->getValueType(0).getVectorNumElements() != 4)
2814    return false;
2815
2816  // Expect 0, 0, 2, 2
2817  for (unsigned i = 0; i < 2; ++i)
2818    if (N->getMaskElt(i) > 0)
2819      return false;
2820
2821  bool HasHi = false;
2822  for (unsigned i = 2; i < 4; ++i) {
2823    int Elt = N->getMaskElt(i);
2824    if (Elt >= 0 && Elt != 2)
2825      return false;
2826    if (Elt == 2)
2827      HasHi = true;
2828  }
2829  // Don't use movsldup if it can be done with a shufps.
2830  return HasHi;
2831}
2832
2833/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2834/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
2835bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
2836  int e = N->getValueType(0).getVectorNumElements() / 2;
2837
2838  for (int i = 0; i < e; ++i)
2839    if (!isUndefOrEqual(N->getMaskElt(i), i))
2840      return false;
2841  for (int i = 0; i < e; ++i)
2842    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
2843      return false;
2844  return true;
2845}
2846
2847/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
2848/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
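/// Element 0's selector ends up in bits [1:0] of the immediate; e.g. the
/// 4-element mask <3,1,2,0> produces 0b00100111 (0x27).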
2849unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
2850  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2851  int NumOperands = SVOp->getValueType(0).getVectorNumElements();
2852
2853  unsigned Shift = (NumOperands == 4) ? 2 : 1;
2854  unsigned Mask = 0;
2855  for (int i = 0; i < NumOperands; ++i) {
2856    int Val = SVOp->getMaskElt(NumOperands-i-1);
2857    if (Val < 0) Val = 0;
2858    if (Val >= NumOperands) Val -= NumOperands;
2859    Mask |= Val;
2860    if (i != NumOperands - 1)
2861      Mask <<= Shift;
2862  }
2863  return Mask;
2864}
2865
2866/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
2867/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
2868unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
2869  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2870  unsigned Mask = 0;
2871  // 8 nodes, but we only care about the last 4.
2872  for (unsigned i = 7; i >= 4; --i) {
2873    int Val = SVOp->getMaskElt(i);
2874    if (Val >= 0)
2875      Mask |= (Val - 4);
2876    if (i != 4)
2877      Mask <<= 2;
2878  }
2879  return Mask;
2880}
2881
2882/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
2883/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
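/// Undef elements contribute 0; e.g. low words <2,1,3,0> give 0b00110110 (0x36).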
2884unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
2885  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2886  unsigned Mask = 0;
2887  // 8 nodes, but we only care about the first 4.
2888  for (int i = 3; i >= 0; --i) {
2889    int Val = SVOp->getMaskElt(i);
2890    if (Val >= 0)
2891      Mask |= Val;
2892    if (i != 0)
2893      Mask <<= 2;
2894  }
2895  return Mask;
2896}
2897
2898/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
2899/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
2900unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
2901  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2902  EVT VVT = N->getValueType(0);
2903  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
2904  int Val = 0;
2905
2906  unsigned i, e;
2907  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
2908    Val = SVOp->getMaskElt(i);
2909    if (Val >= 0)
2910      break;
2911  }
2912  return (Val - i) * EltSize;
2913}
2914
2915/// isZeroNode - Returns true if Elt is a constant zero or a floating point
2916/// constant +0.0.
2917bool X86::isZeroNode(SDValue Elt) {
2918  return ((isa<ConstantSDNode>(Elt) &&
2919           cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
2920          (isa<ConstantFPSDNode>(Elt) &&
2921           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
2922}
2923
2924/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
2925/// their permute mask.
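/// For example, a v4i32 shuffle of V1,V2 with mask <0,5,2,7> becomes a shuffle
/// of V2,V1 with mask <4,1,6,3>.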
2926static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
2927                                    SelectionDAG &DAG) {
2928  EVT VT = SVOp->getValueType(0);
2929  unsigned NumElems = VT.getVectorNumElements();
2930  SmallVector<int, 8> MaskVec;
2931
2932  for (unsigned i = 0; i != NumElems; ++i) {
2933    int idx = SVOp->getMaskElt(i);
2934    if (idx < 0)
2935      MaskVec.push_back(idx);
2936    else if (idx < (int)NumElems)
2937      MaskVec.push_back(idx + NumElems);
2938    else
2939      MaskVec.push_back(idx - NumElems);
2940  }
2941  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
2942                              SVOp->getOperand(0), &MaskVec[0]);
2943}
2944
2945/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
2946/// the two vector operands have swapped position.
2947static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
2948  unsigned NumElems = VT.getVectorNumElements();
2949  for (unsigned i = 0; i != NumElems; ++i) {
2950    int idx = Mask[i];
2951    if (idx < 0)
2952      continue;
2953    else if (idx < (int)NumElems)
2954      Mask[i] = idx + NumElems;
2955    else
2956      Mask[i] = idx - NumElems;
2957  }
2958}
2959
2960/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
2961/// match movhlps. The lower half elements should come from upper half of
2962/// V1 (and in order), and the upper half elements should come from the upper
2963/// half of V2 (and in order).
2964static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
2965  if (Op->getValueType(0).getVectorNumElements() != 4)
2966    return false;
2967  for (unsigned i = 0, e = 2; i != e; ++i)
2968    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
2969      return false;
2970  for (unsigned i = 2; i != 4; ++i)
2971    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
2972      return false;
2973  return true;
2974}
2975
2976/// isScalarLoadToVector - Returns true if the node is a scalar load that
2977/// is promoted to a vector. It also returns the LoadSDNode by reference if
2978/// required.
2979static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
2980  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
2981    return false;
2982  N = N->getOperand(0).getNode();
2983  if (!ISD::isNON_EXTLoad(N))
2984    return false;
2985  if (LD)
2986    *LD = cast<LoadSDNode>(N);
2987  return true;
2988}
2989
2990/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
2991/// match movlp{s|d}. The lower half elements should come from lower half of
2992/// V1 (and in order), and the upper half elements should come from the upper
2993/// half of V2 (and in order). And since V1 will become the source of the
2994/// MOVLP, it must be either a vector load or a scalar load to vector.
2995static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
2996                               ShuffleVectorSDNode *Op) {
2997  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
2998    return false;
2999  // If V2 is a vector load, don't do this transformation. We will try to use
3000  // a load-folding shufps op instead.
3001  if (ISD::isNON_EXTLoad(V2))
3002    return false;
3003
3004  unsigned NumElems = Op->getValueType(0).getVectorNumElements();
3005
3006  if (NumElems != 2 && NumElems != 4)
3007    return false;
3008  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3009    if (!isUndefOrEqual(Op->getMaskElt(i), i))
3010      return false;
3011  for (unsigned i = NumElems/2; i != NumElems; ++i)
3012    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
3013      return false;
3014  return true;
3015}
3016
3017/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
3018/// all the same.
3019static bool isSplatVector(SDNode *N) {
3020  if (N->getOpcode() != ISD::BUILD_VECTOR)
3021    return false;
3022
3023  SDValue SplatValue = N->getOperand(0);
3024  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
3025    if (N->getOperand(i) != SplatValue)
3026      return false;
3027  return true;
3028}
3029
3030/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
3031/// to a zero vector.
3032/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
3033static bool isZeroShuffle(ShuffleVectorSDNode *N) {
3034  SDValue V1 = N->getOperand(0);
3035  SDValue V2 = N->getOperand(1);
3036  unsigned NumElems = N->getValueType(0).getVectorNumElements();
3037  for (unsigned i = 0; i != NumElems; ++i) {
3038    int Idx = N->getMaskElt(i);
3039    if (Idx >= (int)NumElems) {
3040      unsigned Opc = V2.getOpcode();
3041      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
3042        continue;
3043      if (Opc != ISD::BUILD_VECTOR ||
3044          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
3045        return false;
3046    } else if (Idx >= 0) {
3047      unsigned Opc = V1.getOpcode();
3048      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
3049        continue;
3050      if (Opc != ISD::BUILD_VECTOR ||
3051          !X86::isZeroNode(V1.getOperand(Idx)))
3052        return false;
3053    }
3054  }
3055  return true;
3056}
3057
3058/// getZeroVector - Returns a vector of specified type with all zero elements.
3059///
3060static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
3061                             DebugLoc dl) {
3062  assert(VT.isVector() && "Expected a vector type");
3063
3064  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3065  // type.  This ensures they get CSE'd.
3066  SDValue Vec;
3067  if (VT.getSizeInBits() == 64) { // MMX
3068    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3069    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3070  } else if (HasSSE2) {  // SSE2
3071    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3072    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3073  } else { // SSE1
3074    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
3075    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
3076  }
3077  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3078}
3079
3080/// getOnesVector - Returns a vector of specified type with all bits set.
3081///
3082static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
3083  assert(VT.isVector() && "Expected a vector type");
3084
3085  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3086  // type.  This ensures they get CSE'd.
3087  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
3088  SDValue Vec;
3089  if (VT.getSizeInBits() == 64)  // MMX
3090    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3091  else                                              // SSE
3092    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3093  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3094}
3095
3096
3097/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
3098/// that point to V2 point to its first element.
3099static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
3100  EVT VT = SVOp->getValueType(0);
3101  unsigned NumElems = VT.getVectorNumElements();
3102
3103  bool Changed = false;
3104  SmallVector<int, 8> MaskVec;
3105  SVOp->getMask(MaskVec);
3106
3107  for (unsigned i = 0; i != NumElems; ++i) {
3108    if (MaskVec[i] > (int)NumElems) {
3109      MaskVec[i] = NumElems;
3110      Changed = true;
3111    }
3112  }
3113  if (Changed)
3114    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
3115                                SVOp->getOperand(1), &MaskVec[0]);
3116  return SDValue(SVOp, 0);
3117}
3118
3119/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
3120/// operation of the specified width.
3121static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3122                       SDValue V2) {
3123  unsigned NumElems = VT.getVectorNumElements();
3124  SmallVector<int, 8> Mask;
3125  Mask.push_back(NumElems);
3126  for (unsigned i = 1; i != NumElems; ++i)
3127    Mask.push_back(i);
3128  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3129}
3130
3131/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
3132static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3133                          SDValue V2) {
3134  unsigned NumElems = VT.getVectorNumElements();
3135  SmallVector<int, 8> Mask;
3136  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
3137    Mask.push_back(i);
3138    Mask.push_back(i + NumElems);
3139  }
3140  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3141}
3142
3143/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
3144static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3145                          SDValue V2) {
3146  unsigned NumElems = VT.getVectorNumElements();
3147  unsigned Half = NumElems/2;
3148  SmallVector<int, 8> Mask;
3149  for (unsigned i = 0; i != Half; ++i) {
3150    Mask.push_back(i + Half);
3151    Mask.push_back(i + NumElems + Half);
3152  }
3153  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3154}
3155
3156/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
3157static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
3158                            bool HasSSE2) {
3159  if (SV->getValueType(0).getVectorNumElements() <= 4)
3160    return SDValue(SV, 0);
3161
3162  EVT PVT = MVT::v4f32;
3163  EVT VT = SV->getValueType(0);
3164  DebugLoc dl = SV->getDebugLoc();
3165  SDValue V1 = SV->getOperand(0);
3166  int NumElems = VT.getVectorNumElements();
3167  int EltNo = SV->getSplatIndex();
3168
3169  // unpack elements to the correct location
3170  while (NumElems > 4) {
3171    if (EltNo < NumElems/2) {
3172      V1 = getUnpackl(DAG, dl, VT, V1, V1);
3173    } else {
3174      V1 = getUnpackh(DAG, dl, VT, V1, V1);
3175      EltNo -= NumElems/2;
3176    }
3177    NumElems >>= 1;
3178  }
3179
3180  // Perform the splat.
3181  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
3182  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
3183  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
3184  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
3185}
3186
3187/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3188/// vector and a zero or undef vector.  This produces a shuffle where the low
3189/// element of V2 is swizzled into the zero/undef vector, landing at element
3190/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
3191static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3192                                             bool isZero, bool HasSSE2,
3193                                             SelectionDAG &DAG) {
3194  EVT VT = V2.getValueType();
3195  SDValue V1 = isZero
3196    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
3197  unsigned NumElems = VT.getVectorNumElements();
3198  SmallVector<int, 16> MaskVec;
3199  for (unsigned i = 0; i != NumElems; ++i)
3200    // If this is the insertion idx, put the low elt of V2 here.
3201    MaskVec.push_back(i == Idx ? NumElems : i);
3202  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
3203}
3204
3205/// getNumOfConsecutiveZeros - Return the number of consecutive zero (or
3206/// undef) elements at the low or high end of a shuffle result.
3207static
3208unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
3209                                  bool Low, SelectionDAG &DAG) {
3210  unsigned NumZeros = 0;
3211  for (int i = 0; i < NumElems; ++i) {
3212    unsigned Index = Low ? i : NumElems-i-1;
3213    int Idx = SVOp->getMaskElt(Index);
3214    if (Idx < 0) {
3215      ++NumZeros;
3216      continue;
3217    }
3218    SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
3219    if (Elt.getNode() && X86::isZeroNode(Elt))
3220      ++NumZeros;
3221    else
3222      break;
3223  }
3224  return NumZeros;
3225}
3226
3227/// isVectorShift - Returns true if the shuffle can be implemented as a
3228/// logical left or right shift of a vector.
3229/// FIXME: split into pslldqi, psrldqi, palignr variants.
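/// A shift is matched when one end of the result is a run of zero or undef
/// elements and the remaining elements are consecutive elements of a single
/// source vector.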
3230static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3231                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3232  int NumElems = SVOp->getValueType(0).getVectorNumElements();
3233
3234  isLeft = true;
3235  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
3236  if (!NumZeros) {
3237    isLeft = false;
3238    NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
3239    if (!NumZeros)
3240      return false;
3241  }
3242  bool SeenV1 = false;
3243  bool SeenV2 = false;
3244  for (int i = NumZeros; i < NumElems; ++i) {
3245    int Val = isLeft ? (i - NumZeros) : i;
3246    int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
3247    if (Idx < 0)
3248      continue;
3249    if (Idx < NumElems)
3250      SeenV1 = true;
3251    else {
3252      Idx -= NumElems;
3253      SeenV2 = true;
3254    }
3255    if (Idx != Val)
3256      return false;
3257  }
3258  if (SeenV1 && SeenV2)
3259    return false;
3260
3261  ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
3262  ShAmt = NumZeros;
3263  return true;
3264}
3265
3266
3267/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
3268///
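/// Adjacent byte pairs are zero-extended to i16, combined with a shift and or,
/// and inserted into a v8i16 (pinsrw); the result is bitcast back to v16i8.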
3269static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
3270                                       unsigned NumNonZero, unsigned NumZero,
3271                                       SelectionDAG &DAG, TargetLowering &TLI) {
3272  if (NumNonZero > 8)
3273    return SDValue();
3274
3275  DebugLoc dl = Op.getDebugLoc();
3276  SDValue V(0, 0);
3277  bool First = true;
3278  for (unsigned i = 0; i < 16; ++i) {
3279    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
3280    if (ThisIsNonZero && First) {
3281      if (NumZero)
3282        V = getZeroVector(MVT::v8i16, true, DAG, dl);
3283      else
3284        V = DAG.getUNDEF(MVT::v8i16);
3285      First = false;
3286    }
3287
3288    if ((i & 1) != 0) {
3289      SDValue ThisElt(0, 0), LastElt(0, 0);
3290      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
3291      if (LastIsNonZero) {
3292        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
3293                              MVT::i16, Op.getOperand(i-1));
3294      }
3295      if (ThisIsNonZero) {
3296        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
3297        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
3298                              ThisElt, DAG.getConstant(8, MVT::i8));
3299        if (LastIsNonZero)
3300          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
3301      } else
3302        ThisElt = LastElt;
3303
3304      if (ThisElt.getNode())
3305        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
3306                        DAG.getIntPtrConstant(i/2));
3307    }
3308  }
3309
3310  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
3311}
3312
3313/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
3314///
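/// Each non-zero element is inserted into a zero or undef v8i16 via
/// INSERT_VECTOR_ELT (pinsrw).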
3315static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
3316                                       unsigned NumNonZero, unsigned NumZero,
3317                                       SelectionDAG &DAG, TargetLowering &TLI) {
3318  if (NumNonZero > 4)
3319    return SDValue();
3320
3321  DebugLoc dl = Op.getDebugLoc();
3322  SDValue V(0, 0);
3323  bool First = true;
3324  for (unsigned i = 0; i < 8; ++i) {
3325    bool isNonZero = (NonZeros & (1 << i)) != 0;
3326    if (isNonZero) {
3327      if (First) {
3328        if (NumZero)
3329          V = getZeroVector(MVT::v8i16, true, DAG, dl);
3330        else
3331          V = DAG.getUNDEF(MVT::v8i16);
3332        First = false;
3333      }
3334      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
3335                      MVT::v8i16, V, Op.getOperand(i),
3336                      DAG.getIntPtrConstant(i));
3337    }
3338  }
3339
3340  return V;
3341}
3342
3343/// getVShift - Return a vector logical shift node.
3344///
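/// The source is bitcast to v2i64 (v1i64 for MMX) and shifted as a whole
/// register; NumBits is the shift amount in bits.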
3345static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
3346                         unsigned NumBits, SelectionDAG &DAG,
3347                         const TargetLowering &TLI, DebugLoc dl) {
3348  bool isMMX = VT.getSizeInBits() == 64;
3349  EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
3350  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
3351  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
3352  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3353                     DAG.getNode(Opc, dl, ShVT, SrcOp,
3354                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
3355}
3356
3357SDValue
3358X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
3359                                          SelectionDAG &DAG) {
3360
3361  // Check if the scalar load can be widened into a vector load, and if the
3362  // address is "base + cst", see if the cst can be "absorbed" into the
3363  // shuffle mask.
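  // For example, a 4-byte load from a 16-byte-aligned base plus offset 8 can be
  // rewritten as a 16-byte vector load from the base and a splat of element 2
  // (mask <2,2,2,2>).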
3364  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
3365    SDValue Ptr = LD->getBasePtr();
3366    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
3367      return SDValue();
3368    EVT PVT = LD->getValueType(0);
3369    if (PVT != MVT::i32 && PVT != MVT::f32)
3370      return SDValue();
3371
3372    int FI = -1;
3373    int64_t Offset = 0;
3374    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
3375      FI = FINode->getIndex();
3376      Offset = 0;
3377    } else if (Ptr.getOpcode() == ISD::ADD &&
3378               isa<ConstantSDNode>(Ptr.getOperand(1)) &&
3379               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
3380      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
3381      Offset = Ptr.getConstantOperandVal(1);
3382      Ptr = Ptr.getOperand(0);
3383    } else {
3384      return SDValue();
3385    }
3386
3387    SDValue Chain = LD->getChain();
3388    // Make sure the stack object alignment is at least 16.
3389    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3390    if (DAG.InferPtrAlignment(Ptr) < 16) {
3391      if (MFI->isFixedObjectIndex(FI)) {
3392        // Can't change the alignment. Reference stack + offset explicitly
3393        // if stack pointer is at least 16-byte aligned.
3394        unsigned StackAlign = Subtarget->getStackAlignment();
3395        if (StackAlign < 16)
3396          return SDValue();
3397        Offset = MFI->getObjectOffset(FI) + Offset;
3398        SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
3399                                              getPointerTy());
3400        Ptr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
3401                          DAG.getConstant(Offset & ~15, getPointerTy()));
3402        Offset %= 16;
3403      } else {
3404        MFI->setObjectAlignment(FI, 16);
3405      }
3406    }
3407
3408    // (Offset % 16) must be a multiple of 4. The address is then
3409    // Ptr + (Offset & ~15).
3410    if (Offset < 0)
3411      return SDValue();
3412    if ((Offset % 16) & 3)
3413      return SDValue();
3414    int64_t StartOffset = Offset & ~15;
3415    if (StartOffset)
3416      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
3417                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
3418
3419    int EltNo = (Offset - StartOffset) >> 2;
3420    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
3421    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
3422    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0);
3423    // Canonicalize it to a v4i32 shuffle.
3424    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
3425    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3426                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
3427                                            DAG.getUNDEF(MVT::v4i32), &Mask[0]));
3428  }
3429
3430  return SDValue();
3431}
3432
3433SDValue
3434X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
3435  DebugLoc dl = Op.getDebugLoc();
3436  // All zeros are handled with pxor, all ones are handled with pcmpeqd.
3437  if (ISD::isBuildVectorAllZeros(Op.getNode())
3438      || ISD::isBuildVectorAllOnes(Op.getNode())) {
3439    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
3440    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
3441    // eliminated on x86-32 hosts.
3442    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
3443      return Op;
3444
3445    if (ISD::isBuildVectorAllOnes(Op.getNode()))
3446      return getOnesVector(Op.getValueType(), DAG, dl);
3447    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
3448  }
3449
3450  EVT VT = Op.getValueType();
3451  EVT ExtVT = VT.getVectorElementType();
3452  unsigned EVTBits = ExtVT.getSizeInBits();
3453
3454  unsigned NumElems = Op.getNumOperands();
3455  unsigned NumZero  = 0;
3456  unsigned NumNonZero = 0;
3457  unsigned NonZeros = 0;
3458  bool IsAllConstants = true;
3459  SmallSet<SDValue, 8> Values;
3460  for (unsigned i = 0; i < NumElems; ++i) {
3461    SDValue Elt = Op.getOperand(i);
3462    if (Elt.getOpcode() == ISD::UNDEF)
3463      continue;
3464    Values.insert(Elt);
3465    if (Elt.getOpcode() != ISD::Constant &&
3466        Elt.getOpcode() != ISD::ConstantFP)
3467      IsAllConstants = false;
3468    if (X86::isZeroNode(Elt))
3469      NumZero++;
3470    else {
3471      NonZeros |= (1 << i);
3472      NumNonZero++;
3473    }
3474  }
3475
3476  if (NumNonZero == 0) {
3477    // All undef vector. Return an UNDEF.  All zero vectors were handled above.
3478    return DAG.getUNDEF(VT);
3479  }
3480
3481  // Special case for a single non-zero, non-undef element.
3482  if (NumNonZero == 1) {
3483    unsigned Idx = CountTrailingZeros_32(NonZeros);
3484    SDValue Item = Op.getOperand(Idx);
3485
3486    // If this is an insertion of an i64 value on x86-32, and if the top bits of
3487    // the value are obviously zero, truncate the value to i32 and do the
3488    // insertion that way.  Only do this if the value is non-constant or if the
3489    // value is a constant being inserted into element 0.  It is cheaper to do
3490    // a constant pool load than it is to do a movd + shuffle.
3491    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
3492        (!IsAllConstants || Idx == 0)) {
3493      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
3494        // Handle MMX and SSE both.
3495        EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
3496        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
3497
3498        // Truncate the value (which may itself be a constant) to i32, and
3499        // convert it to a vector with movd (S2V+shuffle to zero extend).
3500        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
3501        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
3502        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3503                                           Subtarget->hasSSE2(), DAG);
3504
3505        // Now we have our 32-bit value zero extended in the low element of
3506        // a vector.  If Idx != 0, swizzle it into place.
3507        if (Idx != 0) {
3508          SmallVector<int, 4> Mask;
3509          Mask.push_back(Idx);
3510          for (unsigned i = 1; i != VecElts; ++i)
3511            Mask.push_back(i);
3512          Item = DAG.getVectorShuffle(VecVT, dl, Item,
3513                                      DAG.getUNDEF(Item.getValueType()),
3514                                      &Mask[0]);
3515        }
3516        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
3517      }
3518    }
3519
3520    // If we have a constant or non-constant insertion into the low element of
3521    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
3522    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
3523    // depending on what the source datatype is.
3524    if (Idx == 0) {
3525      if (NumZero == 0) {
3526        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3527      } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
3528          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
3529        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3530        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
3531        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
3532                                           DAG);
3533      } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
3534        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
3535        EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
3536        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
3537        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3538                                           Subtarget->hasSSE2(), DAG);
3539        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
3540      }
3541    }
3542
3543    // Is it a vector logical left shift?
3544    if (NumElems == 2 && Idx == 1 &&
3545        X86::isZeroNode(Op.getOperand(0)) &&
3546        !X86::isZeroNode(Op.getOperand(1))) {
3547      unsigned NumBits = VT.getSizeInBits();
3548      return getVShift(true, VT,
3549                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
3550                                   VT, Op.getOperand(1)),
3551                       NumBits/2, DAG, *this, dl);
3552    }
3553
3554    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
3555      return SDValue();
3556
3557    // Otherwise, if this is a vector with i32 or f32 elements, and the element
3558    // is a non-constant being inserted into an element other than the low one,
3559    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
3560    // movd/movss) to move this into the low element, then shuffle it into
3561    // place.
3562    if (EVTBits == 32) {
3563      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3564
3565      // Turn it into a shuffle of zero and zero-extended scalar to vector.
3566      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
3567                                         Subtarget->hasSSE2(), DAG);
3568      SmallVector<int, 8> MaskVec;
3569      for (unsigned i = 0; i < NumElems; i++)
3570        MaskVec.push_back(i == Idx ? 0 : 1);
3571      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
3572    }
3573  }
3574
3575  // Splat is obviously ok. Let legalizer expand it to a shuffle.
3576  if (Values.size() == 1) {
3577    if (EVTBits == 32) {
3578      // Instead of a shuffle like this:
3579      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
3580      // Check if it's possible to issue this instead.
3581      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
3582      unsigned Idx = CountTrailingZeros_32(NonZeros);
3583      SDValue Item = Op.getOperand(Idx);
3584      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
3585        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
3586    }
3587    return SDValue();
3588  }
3589
3590  // A vector full of immediates; various special cases are already
3591  // handled, so this is best done with a single constant-pool load.
3592  if (IsAllConstants)
3593    return SDValue();
3594
3595  // Let legalizer expand 2-wide build_vectors.
3596  if (EVTBits == 64) {
3597    if (NumNonZero == 1) {
3598      // One half is zero or undef.
3599      unsigned Idx = CountTrailingZeros_32(NonZeros);
3600      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
3601                                 Op.getOperand(Idx));
3602      return getShuffleVectorZeroOrUndef(V2, Idx, true,
3603                                         Subtarget->hasSSE2(), DAG);
3604    }
3605    return SDValue();
3606  }
3607
3608  // If element VT is < 32 bits, convert it to inserts into a zero vector.
3609  if (EVTBits == 8 && NumElems == 16) {
3610    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
3611                                        *this);
3612    if (V.getNode()) return V;
3613  }
3614
3615  if (EVTBits == 16 && NumElems == 8) {
3616    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
3617                                        *this);
3618    if (V.getNode()) return V;
3619  }
3620
3621  // If element VT is == 32 bits, turn it into a number of shuffles.
3622  SmallVector<SDValue, 8> V;
3623  V.resize(NumElems);
3624  if (NumElems == 4 && NumZero > 0) {
3625    for (unsigned i = 0; i < 4; ++i) {
3626      bool isZero = !(NonZeros & (1 << i));
3627      if (isZero)
3628        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
3629      else
3630        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3631    }
3632
3633    for (unsigned i = 0; i < 2; ++i) {
3634      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
3635        default: break;
3636        case 0:
3637          V[i] = V[i*2];  // Must be a zero vector.
3638          break;
3639        case 1:
3640          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
3641          break;
3642        case 2:
3643          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
3644          break;
3645        case 3:
3646          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
3647          break;
3648      }
3649    }
3650
3651    SmallVector<int, 8> MaskVec;
3652    bool Reverse = (NonZeros & 0x3) == 2;
3653    for (unsigned i = 0; i < 2; ++i)
3654      MaskVec.push_back(Reverse ? 1-i : i);
3655    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
3656    for (unsigned i = 0; i < 2; ++i)
3657      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
3658    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
3659  }
3660
3661  if (Values.size() > 2) {
3662    // If we have SSE 4.1, expand into a number of inserts unless the number of
3663    // values to be inserted is equal to the number of elements, in which case
3664    // use the unpack code below in the hopes of matching the consecutive elts
3665    // load merge pattern for shuffles.
3666    // FIXME: We could probably just check that here directly.
3667    if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
3668        getSubtarget()->hasSSE41()) {
3669      V[0] = DAG.getUNDEF(VT);
3670      for (unsigned i = 0; i < NumElems; ++i)
3671        if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
3672          V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
3673                             Op.getOperand(i), DAG.getIntPtrConstant(i));
3674      return V[0];
3675    }
3676    // Expand into a number of unpckl*.
3677    // e.g. for v4f32
3678    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
3679    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
3680    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
3681    for (unsigned i = 0; i < NumElems; ++i)
3682      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3683    NumElems >>= 1;
3684    while (NumElems != 0) {
3685      for (unsigned i = 0; i < NumElems; ++i)
3686        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
3687      NumElems >>= 1;
3688    }
3689    return V[0];
3690  }
3691
3692  return SDValue();
3693}
3694
3695// v8i16 shuffles - Prefer shuffles in the following order:
3696// 1. [all]   pshuflw, pshufhw, optional move
3697// 2. [ssse3] 1 x pshufb
3698// 3. [ssse3] 2 x pshufb + 1 x por
3699// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
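// The strategy below is: for each half of the result, pick the input quadword
// that supplies the most words, shuffle those quadwords together first, and
// then fix up the remaining words with one of the forms listed above.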
3700static
3701SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
3702                                 SelectionDAG &DAG, X86TargetLowering &TLI) {
3703  SDValue V1 = SVOp->getOperand(0);
3704  SDValue V2 = SVOp->getOperand(1);
3705  DebugLoc dl = SVOp->getDebugLoc();
3706  SmallVector<int, 8> MaskVals;
3707
3708  // Determine if more than 1 of the words in each of the low and high quadwords
3709  // of the result come from the same quadword of one of the two inputs.  Undef
3710  // mask values count as coming from any quadword, for better codegen.
3711  SmallVector<unsigned, 4> LoQuad(4);
3712  SmallVector<unsigned, 4> HiQuad(4);
3713  BitVector InputQuads(4);
3714  for (unsigned i = 0; i < 8; ++i) {
3715    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
3716    int EltIdx = SVOp->getMaskElt(i);
3717    MaskVals.push_back(EltIdx);
3718    if (EltIdx < 0) {
3719      ++Quad[0];
3720      ++Quad[1];
3721      ++Quad[2];
3722      ++Quad[3];
3723      continue;
3724    }
3725    ++Quad[EltIdx / 4];
3726    InputQuads.set(EltIdx / 4);
3727  }
3728
3729  int BestLoQuad = -1;
3730  unsigned MaxQuad = 1;
3731  for (unsigned i = 0; i < 4; ++i) {
3732    if (LoQuad[i] > MaxQuad) {
3733      BestLoQuad = i;
3734      MaxQuad = LoQuad[i];
3735    }
3736  }
3737
3738  int BestHiQuad = -1;
3739  MaxQuad = 1;
3740  for (unsigned i = 0; i < 4; ++i) {
3741    if (HiQuad[i] > MaxQuad) {
3742      BestHiQuad = i;
3743      MaxQuad = HiQuad[i];
3744    }
3745  }
3746
3747  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
3748  // of the two input vectors, shuffle them into one input vector so only a
3749  // single pshufb instruction is necessary. If there are more than 2 input
3750  // quads, disable the next transformation since it does not help SSSE3.
3751  bool V1Used = InputQuads[0] || InputQuads[1];
3752  bool V2Used = InputQuads[2] || InputQuads[3];
3753  if (TLI.getSubtarget()->hasSSSE3()) {
3754    if (InputQuads.count() == 2 && V1Used && V2Used) {
3755      BestLoQuad = InputQuads.find_first();
3756      BestHiQuad = InputQuads.find_next(BestLoQuad);
3757    }
3758    if (InputQuads.count() > 2) {
3759      BestLoQuad = -1;
3760      BestHiQuad = -1;
3761    }
3762  }
3763
3764  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
3765  // the shuffle mask.  If a quad is scored as -1, that means that it contains
3766  // words from all 4 input quadwords.
3767  SDValue NewV;
3768  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
3769    SmallVector<int, 8> MaskV;
3770    MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
3771    MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
3772    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
3773                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
3774                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
3775    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
3776
3777    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
3778    // source words for the shuffle, to aid later transformations.
3779    bool AllWordsInNewV = true;
3780    bool InOrder[2] = { true, true };
3781    for (unsigned i = 0; i != 8; ++i) {
3782      int idx = MaskVals[i];
3783      if (idx != (int)i)
3784        InOrder[i/4] = false;
3785      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
3786        continue;
3787      AllWordsInNewV = false;
3788      break;
3789    }
3790
3791    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
3792    if (AllWordsInNewV) {
3793      for (int i = 0; i != 8; ++i) {
3794        int idx = MaskVals[i];
3795        if (idx < 0)
3796          continue;
3797        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
3798        if ((idx != i) && idx < 4)
3799          pshufhw = false;
3800        if ((idx != i) && idx > 3)
3801          pshuflw = false;
3802      }
3803      V1 = NewV;
3804      V2Used = false;
3805      BestLoQuad = 0;
3806      BestHiQuad = 1;
3807    }
3808
3809    // If we've eliminated the use of V2, and the new mask is a pshuflw or
3810    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
3811    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
3812      return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
3813                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
3814    }
3815  }
3816
3817  // If we have SSSE3, and all words of the result are from 1 input vector,
3818  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
3819  // is present, fall back to case 4.
3820  if (TLI.getSubtarget()->hasSSSE3()) {
3821    SmallVector<SDValue,16> pshufbMask;
3822
3823    // If we have elements from both input vectors, set the high bit of the
3824    // shuffle mask element to zero out elements that come from V2 in the V1
3825    // mask, and elements that come from V1 in the V2 mask, so that the two
3826    // results can be OR'd together.
3827    bool TwoInputs = V1Used && V2Used;
3828    for (unsigned i = 0; i != 8; ++i) {
3829      int EltIdx = MaskVals[i] * 2;
3830      if (TwoInputs && (EltIdx >= 16)) {
3831        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3832        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3833        continue;
3834      }
3835      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
3836      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
3837    }
3838    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
3839    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
3840                     DAG.getNode(ISD::BUILD_VECTOR, dl,
3841                                 MVT::v16i8, &pshufbMask[0], 16));
3842    if (!TwoInputs)
3843      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
3844
3845    // Calculate the shuffle mask for the second input, shuffle it, and
3846    // OR it with the first shuffled input.
3847    pshufbMask.clear();
3848    for (unsigned i = 0; i != 8; ++i) {
3849      int EltIdx = MaskVals[i] * 2;
3850      if (EltIdx < 16) {
3851        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3852        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3853        continue;
3854      }
3855      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
3856      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
3857    }
3858    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
3859    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
3860                     DAG.getNode(ISD::BUILD_VECTOR, dl,
3861                                 MVT::v16i8, &pshufbMask[0], 16));
3862    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
3863    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
3864  }
3865
3866  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
3867  // and update MaskVals with new element order.
3868  BitVector InOrder(8);
3869  if (BestLoQuad >= 0) {
3870    SmallVector<int, 8> MaskV;
3871    for (int i = 0; i != 4; ++i) {
3872      int idx = MaskVals[i];
3873      if (idx < 0) {
3874        MaskV.push_back(-1);
3875        InOrder.set(i);
3876      } else if ((idx / 4) == BestLoQuad) {
3877        MaskV.push_back(idx & 3);
3878        InOrder.set(i);
3879      } else {
3880        MaskV.push_back(-1);
3881      }
3882    }
3883    for (unsigned i = 4; i != 8; ++i)
3884      MaskV.push_back(i);
3885    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
3886                                &MaskV[0]);
3887  }
3888
3889  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
3890  // and update MaskVals with the new element order.
3891  if (BestHiQuad >= 0) {
3892    SmallVector<int, 8> MaskV;
3893    for (unsigned i = 0; i != 4; ++i)
3894      MaskV.push_back(i);
3895    for (unsigned i = 4; i != 8; ++i) {
3896      int idx = MaskVals[i];
3897      if (idx < 0) {
3898        MaskV.push_back(-1);
3899        InOrder.set(i);
3900      } else if ((idx / 4) == BestHiQuad) {
3901        MaskV.push_back((idx & 3) + 4);
3902        InOrder.set(i);
3903      } else {
3904        MaskV.push_back(-1);
3905      }
3906    }
3907    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
3908                                &MaskV[0]);
3909  }
3910
3911  // In case BestHiQuad & BestLoQuad were both -1, which means each quadword
3912  // has a word from each of the four input quadwords, calculate the InOrder
3913  // bitvector now before falling through to the insert/extract cleanup.
3914  if (BestLoQuad == -1 && BestHiQuad == -1) {
3915    NewV = V1;
3916    for (int i = 0; i != 8; ++i)
3917      if (MaskVals[i] < 0 || MaskVals[i] == i)
3918        InOrder.set(i);
3919  }
3920
3921  // The other elements are put in the right place using pextrw and pinsrw.
3922  for (unsigned i = 0; i != 8; ++i) {
3923    if (InOrder[i])
3924      continue;
3925    int EltIdx = MaskVals[i];
3926    if (EltIdx < 0)
3927      continue;
3928    SDValue ExtOp = (EltIdx < 8)
3929    ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
3930                  DAG.getIntPtrConstant(EltIdx))
3931    : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
3932                  DAG.getIntPtrConstant(EltIdx - 8));
3933    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
3934                       DAG.getIntPtrConstant(i));
3935  }
3936  return NewV;
3937}
3938
3939// v16i8 shuffles - Prefer shuffles in the following order:
3940// 1. [ssse3] 1 x pshufb
3941// 2. [ssse3] 2 x pshufb + 1 x por
3942// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
3943static
3944SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
3945                                 SelectionDAG &DAG, X86TargetLowering &TLI) {
3946  SDValue V1 = SVOp->getOperand(0);
3947  SDValue V2 = SVOp->getOperand(1);
3948  DebugLoc dl = SVOp->getDebugLoc();
3949  SmallVector<int, 16> MaskVals;
3950  SVOp->getMask(MaskVals);
3951
3952  // If we have SSSE3, case 1 is generated when all result bytes come from
3953  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
3954  // present, fall back to case 3.
3955  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
3956  bool V1Only = true;
3957  bool V2Only = true;
3958  for (unsigned i = 0; i < 16; ++i) {
3959    int EltIdx = MaskVals[i];
3960    if (EltIdx < 0)
3961      continue;
3962    if (EltIdx < 16)
3963      V2Only = false;
3964    else
3965      V1Only = false;
3966  }
3967
3968  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
3969  if (TLI.getSubtarget()->hasSSSE3()) {
3970    SmallVector<SDValue,16> pshufbMask;
3971
3972    // If all result elements are from one input vector, then only translate
3973    // undef mask values to 0x80 (zero out result) in the pshufb mask.
3974    //
3975    // Otherwise, we have elements from both input vectors, and must zero out
3976    // elements that come from V2 in the first mask, and V1 in the second mask
3977    // so that we can OR them together.
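    // For example, reversing the bytes of V1 (MaskVals = <15,14,...,1,0>) takes
    // a single pshufb with that same byte mask; any undef mask element becomes
    // 0x80, which makes pshufb zero the corresponding result byte.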
3978    bool TwoInputs = !(V1Only || V2Only);
3979    for (unsigned i = 0; i != 16; ++i) {
3980      int EltIdx = MaskVals[i];
3981      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
3982        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3983        continue;
3984      }
3985      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
3986    }
3987    // If all the elements are from V2, assign it to V1 and return after
3988    // building the first pshufb.
3989    if (V2Only)
3990      V1 = V2;
3991    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
3992                     DAG.getNode(ISD::BUILD_VECTOR, dl,
3993                                 MVT::v16i8, &pshufbMask[0], 16));
3994    if (!TwoInputs)
3995      return V1;
3996
3997    // Calculate the shuffle mask for the second input, shuffle it, and
3998    // OR it with the first shuffled input.
3999    pshufbMask.clear();
4000    for (unsigned i = 0; i != 16; ++i) {
4001      int EltIdx = MaskVals[i];
4002      if (EltIdx < 16) {
4003        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4004        continue;
4005      }
4006      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4007    }
4008    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4009                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4010                                 MVT::v16i8, &pshufbMask[0], 16));
4011    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4012  }
4013
4014  // No SSSE3 - Calculate the in-place words, then fix all out-of-place words
4015  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
4016  // the 16 different words that comprise the two doublequadword input vectors.
4017  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4018  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
4019  SDValue NewV = V2Only ? V2 : V1;
4020  for (int i = 0; i != 8; ++i) {
4021    int Elt0 = MaskVals[i*2];
4022    int Elt1 = MaskVals[i*2+1];
4023
4024    // This word of the result is all undef, skip it.
4025    if (Elt0 < 0 && Elt1 < 0)
4026      continue;
4027
4028    // This word of the result is already in the correct place, skip it.
4029    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
4030      continue;
4031    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
4032      continue;
4033
4034    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
4035    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
4036    SDValue InsElt;
4037
4038    // If Elt0 and Elt1 are defined and consecutive, and can be loaded
4039    // together with a single extract, extract the word and insert it.
4040    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
4041      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4042                           DAG.getIntPtrConstant(Elt1 / 2));
4043      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4044                        DAG.getIntPtrConstant(i));
4045      continue;
4046    }
4047
4048    // If Elt1 is defined, extract it from the appropriate source.  If the
4049    // source byte is not also odd, shift the extracted word left 8 bits;
4050    // otherwise clear the bottom 8 bits if we need to do an OR.
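    // For example, if Elt1 == 4 the byte sits in the low half of source word 2,
    // so the extracted word is shifted left 8 bits to move it into the high
    // byte; if Elt1 == 5 it is already in the high byte of word 2.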
4051    if (Elt1 >= 0) {
4052      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4053                           DAG.getIntPtrConstant(Elt1 / 2));
4054      if ((Elt1 & 1) == 0)
4055        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
4056                             DAG.getConstant(8, TLI.getShiftAmountTy()));
4057      else if (Elt0 >= 0)
4058        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
4059                             DAG.getConstant(0xFF00, MVT::i16));
4060    }
4061    // If Elt0 is defined, extract it from the appropriate source.  If the
4062    // source byte is not also even, shift the extracted word right 8 bits. If
4063    // Elt1 was also defined, OR the extracted values together before
4064    // inserting them in the result.
4065    if (Elt0 >= 0) {
4066      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
4067                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
4068      if ((Elt0 & 1) != 0)
4069        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
4070                              DAG.getConstant(8, TLI.getShiftAmountTy()));
4071      else if (Elt1 >= 0)
4072        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
4073                             DAG.getConstant(0x00FF, MVT::i16));
4074      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
4075                         : InsElt0;
4076    }
4077    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4078                       DAG.getIntPtrConstant(i));
4079  }
4080  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
4081}
4082
4083/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
4084/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
4085/// done when every pair / quad of shuffle mask elements points to elements in
4086/// the right sequence. e.g.
4087/// vector_shuffle <>, <>, < 2, 3, | 10, 11, | 0, 1, | 14, 15>
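/// which can be lowered as the v4i32 shuffle < 1, 5, 0, 7 >.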
4088static
4089SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
4090                                 SelectionDAG &DAG,
4091                                 TargetLowering &TLI, DebugLoc dl) {
4092  EVT VT = SVOp->getValueType(0);
4093  SDValue V1 = SVOp->getOperand(0);
4094  SDValue V2 = SVOp->getOperand(1);
4095  unsigned NumElems = VT.getVectorNumElements();
4096  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
4097  EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
4098  EVT MaskEltVT = MaskVT.getVectorElementType();
4099  EVT NewVT = MaskVT;
4100  switch (VT.getSimpleVT().SimpleTy) {
4101  default: assert(false && "Unexpected!");
4102  case MVT::v4f32: NewVT = MVT::v2f64; break;
4103  case MVT::v4i32: NewVT = MVT::v2i64; break;
4104  case MVT::v8i16: NewVT = MVT::v4i32; break;
4105  case MVT::v16i8: NewVT = MVT::v4i32; break;
4106  }
4107
4108  if (NewWidth == 2) {
4109    if (VT.isInteger())
4110      NewVT = MVT::v2i64;
4111    else
4112      NewVT = MVT::v2f64;
4113  }
4114  int Scale = NumElems / NewWidth;
4115  SmallVector<int, 8> MaskVec;
4116  for (unsigned i = 0; i < NumElems; i += Scale) {
4117    int StartIdx = -1;
4118    for (int j = 0; j < Scale; ++j) {
4119      int EltIdx = SVOp->getMaskElt(i+j);
4120      if (EltIdx < 0)
4121        continue;
4122      if (StartIdx == -1)
4123        StartIdx = EltIdx - (EltIdx % Scale);
4124      if (EltIdx != StartIdx + j)
4125        return SDValue();
4126    }
4127    if (StartIdx == -1)
4128      MaskVec.push_back(-1);
4129    else
4130      MaskVec.push_back(StartIdx / Scale);
4131  }
4132
4133  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
4134  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
4135  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
4136}
4137
4138/// getVZextMovL - Return a zero-extending vector move low node.
4139///
4140static SDValue getVZextMovL(EVT VT, EVT OpVT,
4141                            SDValue SrcOp, SelectionDAG &DAG,
4142                            const X86Subtarget *Subtarget, DebugLoc dl) {
4143  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
4144    LoadSDNode *LD = NULL;
4145    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
4146      LD = dyn_cast<LoadSDNode>(SrcOp);
4147    if (!LD) {
4148      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
4149      // instead.
4150      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
4151      if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
4152          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
4153          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
4154          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
4155        // PR2108
4156        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
4157        return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4158                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4159                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4160                                                   OpVT,
4161                                                   SrcOp.getOperand(0)
4162                                                          .getOperand(0))));
4163      }
4164    }
4165  }
4166
4167  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4168                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4169                                 DAG.getNode(ISD::BIT_CONVERT, dl,
4170                                             OpVT, SrcOp)));
4171}
4172
4173/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
4174/// shuffles.
4175static SDValue
4176LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
4177  SDValue V1 = SVOp->getOperand(0);
4178  SDValue V2 = SVOp->getOperand(1);
4179  DebugLoc dl = SVOp->getDebugLoc();
4180  EVT VT = SVOp->getValueType(0);
4181
4182  SmallVector<std::pair<int, int>, 8> Locs;
4183  Locs.resize(4);
4184  SmallVector<int, 8> Mask1(4U, -1);
4185  SmallVector<int, 8> PermMask;
4186  SVOp->getMask(PermMask);
4187
4188  unsigned NumHi = 0;
4189  unsigned NumLo = 0;
4190  for (unsigned i = 0; i != 4; ++i) {
4191    int Idx = PermMask[i];
4192    if (Idx < 0) {
4193      Locs[i] = std::make_pair(-1, -1);
4194    } else {
4195      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
4196      if (Idx < 4) {
4197        Locs[i] = std::make_pair(0, NumLo);
4198        Mask1[NumLo] = Idx;
4199        NumLo++;
4200      } else {
4201        Locs[i] = std::make_pair(1, NumHi);
4202        if (2+NumHi < 4)
4203          Mask1[2+NumHi] = Idx;
4204        NumHi++;
4205      }
4206    }
4207  }
4208
4209  if (NumLo <= 2 && NumHi <= 2) {
4210    // No more than two elements come from either vector; this can be
4211    // implemented with two shuffles.  The first shuffle gathers the elements;
4212    // the second shuffle, which takes the first shuffle as both of its
4213    // vector operands, puts the elements into the right order.
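    // For example, for the mask <0,4,1,5> the first shuffle produces
    // <a0,a1,b0,b1> (Mask1 = <0,1,4,5>); the second shuffle of that result with
    // itself (Mask2 = <0,2,5,7>) yields the desired <a0,b0,a1,b1>.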
4214    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4215
4216    SmallVector<int, 8> Mask2(4U, -1);
4217
4218    for (unsigned i = 0; i != 4; ++i) {
4219      if (Locs[i].first == -1)
4220        continue;
4221      else {
4222        unsigned Idx = (i < 2) ? 0 : 4;
4223        Idx += Locs[i].first * 2 + Locs[i].second;
4224        Mask2[i] = Idx;
4225      }
4226    }
4227
4228    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
4229  } else if (NumLo == 3 || NumHi == 3) {
4230    // Otherwise, we must have three elements from one vector, call it X, and
4231    // one element from the other, call it Y.  First, use a shufps to build an
4232    // intermediate vector with the one element from Y and the element from X
4233    // that will be in the same half in the final destination (the indexes don't
4234    // matter). Then, use a shufps to build the final vector, taking the half
4235    // containing the element from Y from the intermediate, and the other half
4236    // from X.
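    // For example, for the mask <0,1,2,4> the first shufps builds the
    // intermediate <b0,u,a2,u> (Mask1 = <4,u,2,u>); the second shufps combines
    // it with V1 using the mask <0,1,6,4> to produce the desired <a0,a1,a2,b0>.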
4237    if (NumHi == 3) {
4238      // Normalize it so the 3 elements come from V1.
4239      CommuteVectorShuffleMask(PermMask, VT);
4240      std::swap(V1, V2);
4241    }
4242
4243    // Find the element from V2.
4244    unsigned HiIndex;
4245    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
4246      int Val = PermMask[HiIndex];
4247      if (Val < 0)
4248        continue;
4249      if (Val >= 4)
4250        break;
4251    }
4252
4253    Mask1[0] = PermMask[HiIndex];
4254    Mask1[1] = -1;
4255    Mask1[2] = PermMask[HiIndex^1];
4256    Mask1[3] = -1;
4257    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4258
4259    if (HiIndex >= 2) {
4260      Mask1[0] = PermMask[0];
4261      Mask1[1] = PermMask[1];
4262      Mask1[2] = HiIndex & 1 ? 6 : 4;
4263      Mask1[3] = HiIndex & 1 ? 4 : 6;
4264      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4265    } else {
4266      Mask1[0] = HiIndex & 1 ? 2 : 0;
4267      Mask1[1] = HiIndex & 1 ? 0 : 2;
4268      Mask1[2] = PermMask[2];
4269      Mask1[3] = PermMask[3];
4270      if (Mask1[2] >= 0)
4271        Mask1[2] += 4;
4272      if (Mask1[3] >= 0)
4273        Mask1[3] += 4;
4274      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
4275    }
4276  }
4277
4278  // Break it into (shuffle shuffle_hi, shuffle_lo).
4279  Locs.assign(4, std::make_pair(-1, -1));
4280  SmallVector<int,8> LoMask(4U, -1);
4281  SmallVector<int,8> HiMask(4U, -1);
4282
4283  SmallVector<int,8> *MaskPtr = &LoMask;
4284  unsigned MaskIdx = 0;
4285  unsigned LoIdx = 0;
4286  unsigned HiIdx = 2;
4287  for (unsigned i = 0; i != 4; ++i) {
4288    if (i == 2) {
4289      MaskPtr = &HiMask;
4290      MaskIdx = 1;
4291      LoIdx = 0;
4292      HiIdx = 2;
4293    }
4294    int Idx = PermMask[i];
4295    if (Idx < 0) {
4296      Locs[i] = std::make_pair(-1, -1);
4297    } else if (Idx < 4) {
4298      Locs[i] = std::make_pair(MaskIdx, LoIdx);
4299      (*MaskPtr)[LoIdx] = Idx;
4300      LoIdx++;
4301    } else {
4302      Locs[i] = std::make_pair(MaskIdx, HiIdx);
4303      (*MaskPtr)[HiIdx] = Idx;
4304      HiIdx++;
4305    }
4306  }
4307
4308  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
4309  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
4310  SmallVector<int, 8> MaskOps;
4311  for (unsigned i = 0; i != 4; ++i) {
4312    if (Locs[i].first == -1) {
4313      MaskOps.push_back(-1);
4314    } else {
4315      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
4316      MaskOps.push_back(Idx);
4317    }
4318  }
4319  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
4320}
4321
4322SDValue
4323X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
4324  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
4325  SDValue V1 = Op.getOperand(0);
4326  SDValue V2 = Op.getOperand(1);
4327  EVT VT = Op.getValueType();
4328  DebugLoc dl = Op.getDebugLoc();
4329  unsigned NumElems = VT.getVectorNumElements();
4330  bool isMMX = VT.getSizeInBits() == 64;
4331  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
4332  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
4333  bool V1IsSplat = false;
4334  bool V2IsSplat = false;
4335
4336  if (isZeroShuffle(SVOp))
4337    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
4338
4339  // Promote splats to v4f32.
4340  if (SVOp->isSplat()) {
4341    if (isMMX || NumElems < 4)
4342      return Op;
4343    return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
4344  }
4345
4346  // If the shuffle can be profitably rewritten as a narrower shuffle, then
4347  // do it!
4348  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
4349    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4350    if (NewOp.getNode())
4351      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4352                         LowerVECTOR_SHUFFLE(NewOp, DAG));
4353  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
4354    // FIXME: Figure out a cleaner way to do this.
4355    // Try to make use of movq to zero out the top part.
4356    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
4357      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4358      if (NewOp.getNode()) {
4359        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
4360          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
4361                              DAG, Subtarget, dl);
4362      }
4363    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
4364      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4365      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
4366        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
4367                            DAG, Subtarget, dl);
4368    }
4369  }
4370
4371  if (X86::isPSHUFDMask(SVOp))
4372    return Op;
4373
4374  // Check if this can be converted into a logical shift.
4375  bool isLeft = false;
4376  unsigned ShAmt = 0;
4377  SDValue ShVal;
4378  bool isShift = getSubtarget()->hasSSE2() &&
4379    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
4380  if (isShift && ShVal.hasOneUse()) {
4381    // If the shifted value has multiple uses, it may be cheaper to use
4382    // v_set0 + movlhps or movhlps, etc.
4383    EVT EltVT = VT.getVectorElementType();
4384    ShAmt *= EltVT.getSizeInBits();
4385    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4386  }
4387
4388  if (X86::isMOVLMask(SVOp)) {
4389    if (V1IsUndef)
4390      return V2;
4391    if (ISD::isBuildVectorAllZeros(V1.getNode()))
4392      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
4393    if (!isMMX)
4394      return Op;
4395  }
4396
4397  // FIXME: fold these into legal mask.
4398  if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
4399                 X86::isMOVSLDUPMask(SVOp) ||
4400                 X86::isMOVHLPSMask(SVOp) ||
4401                 X86::isMOVLHPSMask(SVOp) ||
4402                 X86::isMOVLPMask(SVOp)))
4403    return Op;
4404
4405  if (ShouldXformToMOVHLPS(SVOp) ||
4406      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
4407    return CommuteVectorShuffle(SVOp, DAG);
4408
4409  if (isShift) {
4410    // No better options. Use a vshl / vsrl.
4411    EVT EltVT = VT.getVectorElementType();
4412    ShAmt *= EltVT.getSizeInBits();
4413    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4414  }
4415
4416  bool Commuted = false;
4417  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
4418  // 1,1,1,1 -> v8i16 though.
4419  V1IsSplat = isSplatVector(V1.getNode());
4420  V2IsSplat = isSplatVector(V2.getNode());
4421
4422  // Canonicalize the splat or undef, if present, to be on the RHS.
4423  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
4424    Op = CommuteVectorShuffle(SVOp, DAG);
4425    SVOp = cast<ShuffleVectorSDNode>(Op);
4426    V1 = SVOp->getOperand(0);
4427    V2 = SVOp->getOperand(1);
4428    std::swap(V1IsSplat, V2IsSplat);
4429    std::swap(V1IsUndef, V2IsUndef);
4430    Commuted = true;
4431  }
4432
4433  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
4434    // Shuffling low element of v1 into undef, just return v1.
4435    if (V2IsUndef)
4436      return V1;
4437    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
4438    // the instruction selector will not match, so get a canonical MOVL with
4439    // swapped operands to undo the commute.
4440    return getMOVL(DAG, dl, VT, V2, V1);
4441  }
4442
4443  if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
4444      X86::isUNPCKH_v_undef_Mask(SVOp) ||
4445      X86::isUNPCKLMask(SVOp) ||
4446      X86::isUNPCKHMask(SVOp))
4447    return Op;
4448
4449  if (V2IsSplat) {
4450    // Normalize the mask so all entries that point to V2 point to its first
4451    // element, then try to match unpck{h|l} again. If a match is found, return
4452    // a new vector_shuffle with the corrected mask.
4453    SDValue NewMask = NormalizeMask(SVOp, DAG);
4454    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
4455    if (NSVOp != SVOp) {
4456      if (X86::isUNPCKLMask(NSVOp, true)) {
4457        return NewMask;
4458      } else if (X86::isUNPCKHMask(NSVOp, true)) {
4459        return NewMask;
4460      }
4461    }
4462  }
4463
4464  if (Commuted) {
4465    // Commute it back and try unpck* again.
4466    // FIXME: this seems wrong.
4467    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
4468    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
4469    if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
4470        X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
4471        X86::isUNPCKLMask(NewSVOp) ||
4472        X86::isUNPCKHMask(NewSVOp))
4473      return NewOp;
4474  }
4475
4476  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
4477
4478  // Normalize the node to match x86 shuffle ops if needed
4479  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
4480    return CommuteVectorShuffle(SVOp, DAG);
4481
4482  // Check for legal shuffle and return?
4483  SmallVector<int, 16> PermMask;
4484  SVOp->getMask(PermMask);
4485  if (isShuffleMaskLegal(PermMask, VT))
4486    return Op;
4487
4488  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
4489  if (VT == MVT::v8i16) {
4490    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
4491    if (NewOp.getNode())
4492      return NewOp;
4493  }
4494
4495  if (VT == MVT::v16i8) {
4496    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
4497    if (NewOp.getNode())
4498      return NewOp;
4499  }
4500
4501  // Handle all 4 wide cases with a number of shuffles except for MMX.
4502  if (NumElems == 4 && !isMMX)
4503    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
4504
4505  return SDValue();
4506}
4507
4508SDValue
4509X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
4510                                                SelectionDAG &DAG) {
4511  EVT VT = Op.getValueType();
4512  DebugLoc dl = Op.getDebugLoc();
4513  if (VT.getSizeInBits() == 8) {
4514    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
4515                                    Op.getOperand(0), Op.getOperand(1));
4516    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4517                                    DAG.getValueType(VT));
4518    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4519  } else if (VT.getSizeInBits() == 16) {
4520    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4521    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
4522    if (Idx == 0)
4523      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4524                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4525                                     DAG.getNode(ISD::BIT_CONVERT, dl,
4526                                                 MVT::v4i32,
4527                                                 Op.getOperand(0)),
4528                                     Op.getOperand(1)));
4529    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
4530                                    Op.getOperand(0), Op.getOperand(1));
4531    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4532                                    DAG.getValueType(VT));
4533    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4534  } else if (VT == MVT::f32) {
4535    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
4536    // the result back to an FR32 register. It's only worth matching if the
4537    // result has a single use which is a store or a bitcast to i32.  And in
4538    // the case of a store, it's not worth it if the index is a constant 0,
4539    // because a MOVSSmr can be used instead, which is smaller and faster.
4540    if (!Op.hasOneUse())
4541      return SDValue();
4542    SDNode *User = *Op.getNode()->use_begin();
4543    if ((User->getOpcode() != ISD::STORE ||
4544         (isa<ConstantSDNode>(Op.getOperand(1)) &&
4545          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
4546        (User->getOpcode() != ISD::BIT_CONVERT ||
4547         User->getValueType(0) != MVT::i32))
4548      return SDValue();
4549    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4550                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
4551                                              Op.getOperand(0)),
4552                                              Op.getOperand(1));
4553    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
4554  } else if (VT == MVT::i32) {
4555    // ExtractPS works with constant index.
4556    if (isa<ConstantSDNode>(Op.getOperand(1)))
4557      return Op;
4558  }
4559  return SDValue();
4560}
4561
4562
4563SDValue
4564X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4565  if (!isa<ConstantSDNode>(Op.getOperand(1)))
4566    return SDValue();
4567
4568  if (Subtarget->hasSSE41()) {
4569    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
4570    if (Res.getNode())
4571      return Res;
4572  }
4573
4574  EVT VT = Op.getValueType();
4575  DebugLoc dl = Op.getDebugLoc();
4576  // TODO: handle v16i8.
4577  if (VT.getSizeInBits() == 16) {
4578    SDValue Vec = Op.getOperand(0);
4579    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4580    if (Idx == 0)
4581      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4582                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4583                                     DAG.getNode(ISD::BIT_CONVERT, dl,
4584                                                 MVT::v4i32, Vec),
4585                                     Op.getOperand(1)));
4586    // Transform it so it matches pextrw, which produces a 32-bit result.
4587    EVT EltVT = MVT::i32;
4588    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
4589                                    Op.getOperand(0), Op.getOperand(1));
4590    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
4591                                    DAG.getValueType(VT));
4592    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4593  } else if (VT.getSizeInBits() == 32) {
4594    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4595    if (Idx == 0)
4596      return Op;
4597
4598    // SHUFPS the element to the lowest double word, then movss.
4599    int Mask[4] = { Idx, -1, -1, -1 };
4600    EVT VVT = Op.getOperand(0).getValueType();
4601    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4602                                       DAG.getUNDEF(VVT), Mask);
4603    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4604                       DAG.getIntPtrConstant(0));
4605  } else if (VT.getSizeInBits() == 64) {
4606    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
4607    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
4608    //        to match extract_elt for f64.
4609    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4610    if (Idx == 0)
4611      return Op;
4612
4613    // UNPCKHPD the element to the lowest double word, then movsd.
4614    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
4615    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
4616    int Mask[2] = { 1, -1 };
4617    EVT VVT = Op.getOperand(0).getValueType();
4618    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4619                                       DAG.getUNDEF(VVT), Mask);
4620    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4621                       DAG.getIntPtrConstant(0));
4622  }
4623
4624  return SDValue();
4625}
4626
4627SDValue
4628X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
4629  EVT VT = Op.getValueType();
4630  EVT EltVT = VT.getVectorElementType();
4631  DebugLoc dl = Op.getDebugLoc();
4632
4633  SDValue N0 = Op.getOperand(0);
4634  SDValue N1 = Op.getOperand(1);
4635  SDValue N2 = Op.getOperand(2);
4636
4637  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
4638      isa<ConstantSDNode>(N2)) {
4639    unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB
4640                                                : X86ISD::PINSRW;
4641    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
4642    // second argument.
4643    if (N1.getValueType() != MVT::i32)
4644      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4645    if (N2.getValueType() != MVT::i32)
4646      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4647    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
4648  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
4649    // Bits [7:6] of the constant are the source select.  This will always be
4650    //  zero here.  The DAG Combiner may combine an extract_elt index into these
4651    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
4652    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
4653    // Bits [5:4] of the constant are the destination select.  This is the
4654    //  value of the incoming immediate.
4655    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
4656    //   combine either bitwise AND or insert of float 0.0 to set these bits.
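    // For example, inserting into element 2 with no zeroing yields an
    // immediate of (2 << 4) == 0x20: destination select 2, source select 0,
    // zero mask 0.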
4657    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
4658    // Create this as a scalar to vector.
4659    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
4660    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
4661  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
4662    // PINSR* works with constant index.
4663    return Op;
4664  }
4665  return SDValue();
4666}
4667
4668SDValue
4669X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4670  EVT VT = Op.getValueType();
4671  EVT EltVT = VT.getVectorElementType();
4672
4673  if (Subtarget->hasSSE41())
4674    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
4675
4676  if (EltVT == MVT::i8)
4677    return SDValue();
4678
4679  DebugLoc dl = Op.getDebugLoc();
4680  SDValue N0 = Op.getOperand(0);
4681  SDValue N1 = Op.getOperand(1);
4682  SDValue N2 = Op.getOperand(2);
4683
4684  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
4685    // Transform it so it matches pinsrw, which expects a 16-bit value in a
4686    // GR32 as its second argument.
4687    if (N1.getValueType() != MVT::i32)
4688      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4689    if (N2.getValueType() != MVT::i32)
4690      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4691    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
4692  }
4693  return SDValue();
4694}
4695
4696SDValue
4697X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
4698  DebugLoc dl = Op.getDebugLoc();
4699  if (Op.getValueType() == MVT::v2f32)
4700    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
4701                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
4702                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
4703                                               Op.getOperand(0))));
4704
4705  if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
4706    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
4707
4708  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
4709  EVT VT = MVT::v2i32;
4710  switch (Op.getValueType().getSimpleVT().SimpleTy) {
4711  default: break;
4712  case MVT::v16i8:
4713  case MVT::v8i16:
4714    VT = MVT::v4i32;
4715    break;
4716  }
4717  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
4718                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
4719}
4720
4721// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
4722// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
4723// one of the above-mentioned nodes. It has to be wrapped because otherwise
4724// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
4725// be used to form addressing modes. These wrapped nodes will be selected
4726// into MOV32ri.
4727SDValue
4728X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
4729  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
4730
4731  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4732  // global base reg.
4733  unsigned char OpFlag = 0;
4734  unsigned WrapperKind = X86ISD::Wrapper;
4735  CodeModel::Model M = getTargetMachine().getCodeModel();
4736
4737  if (Subtarget->isPICStyleRIPRel() &&
4738      (M == CodeModel::Small || M == CodeModel::Kernel))
4739    WrapperKind = X86ISD::WrapperRIP;
4740  else if (Subtarget->isPICStyleGOT())
4741    OpFlag = X86II::MO_GOTOFF;
4742  else if (Subtarget->isPICStyleStubPIC())
4743    OpFlag = X86II::MO_PIC_BASE_OFFSET;
4744
4745  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
4746                                             CP->getAlignment(),
4747                                             CP->getOffset(), OpFlag);
4748  DebugLoc DL = CP->getDebugLoc();
4749  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4750  // With PIC, the address is actually $g + Offset.
4751  if (OpFlag) {
4752    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4753                         DAG.getNode(X86ISD::GlobalBaseReg,
4754                                     DebugLoc::getUnknownLoc(), getPointerTy()),
4755                         Result);
4756  }
4757
4758  return Result;
4759}
4760
4761SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
4762  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
4763
4764  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4765  // global base reg.
4766  unsigned char OpFlag = 0;
4767  unsigned WrapperKind = X86ISD::Wrapper;
4768  CodeModel::Model M = getTargetMachine().getCodeModel();
4769
4770  if (Subtarget->isPICStyleRIPRel() &&
4771      (M == CodeModel::Small || M == CodeModel::Kernel))
4772    WrapperKind = X86ISD::WrapperRIP;
4773  else if (Subtarget->isPICStyleGOT())
4774    OpFlag = X86II::MO_GOTOFF;
4775  else if (Subtarget->isPICStyleStubPIC())
4776    OpFlag = X86II::MO_PIC_BASE_OFFSET;
4777
4778  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
4779                                          OpFlag);
4780  DebugLoc DL = JT->getDebugLoc();
4781  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4782
4783  // With PIC, the address is actually $g + Offset.
4784  if (OpFlag) {
4785    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4786                         DAG.getNode(X86ISD::GlobalBaseReg,
4787                                     DebugLoc::getUnknownLoc(), getPointerTy()),
4788                         Result);
4789  }
4790
4791  return Result;
4792}
4793
4794SDValue
4795X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
4796  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
4797
4798  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4799  // global base reg.
4800  unsigned char OpFlag = 0;
4801  unsigned WrapperKind = X86ISD::Wrapper;
4802  CodeModel::Model M = getTargetMachine().getCodeModel();
4803
4804  if (Subtarget->isPICStyleRIPRel() &&
4805      (M == CodeModel::Small || M == CodeModel::Kernel))
4806    WrapperKind = X86ISD::WrapperRIP;
4807  else if (Subtarget->isPICStyleGOT())
4808    OpFlag = X86II::MO_GOTOFF;
4809  else if (Subtarget->isPICStyleStubPIC())
4810    OpFlag = X86II::MO_PIC_BASE_OFFSET;
4811
4812  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
4813
4814  DebugLoc DL = Op.getDebugLoc();
4815  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4816
4817
4818  // With PIC, the address is actually $g + Offset.
4819  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
4820      !Subtarget->is64Bit()) {
4821    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4822                         DAG.getNode(X86ISD::GlobalBaseReg,
4823                                     DebugLoc::getUnknownLoc(),
4824                                     getPointerTy()),
4825                         Result);
4826  }
4827
4828  return Result;
4829}
4830
4831SDValue
4832X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) {
4833  // Create the TargetBlockAddressAddress node.
4834  unsigned char OpFlags =
4835    Subtarget->ClassifyBlockAddressReference();
4836  CodeModel::Model M = getTargetMachine().getCodeModel();
4837  BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
4838  DebugLoc dl = Op.getDebugLoc();
4839  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
4840                                       /*isTarget=*/true, OpFlags);
4841
4842  if (Subtarget->isPICStyleRIPRel() &&
4843      (M == CodeModel::Small || M == CodeModel::Kernel))
4844    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
4845  else
4846    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
4847
4848  // With PIC, the address is actually $g + Offset.
4849  if (isGlobalRelativeToPICBase(OpFlags)) {
4850    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
4851                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
4852                         Result);
4853  }
4854
4855  return Result;
4856}
4857
4858SDValue
4859X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
4860                                      int64_t Offset,
4861                                      SelectionDAG &DAG) const {
4862  // Create the TargetGlobalAddress node, folding in the constant
4863  // offset if it is legal.
4864  unsigned char OpFlags =
4865    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
4866  CodeModel::Model M = getTargetMachine().getCodeModel();
4867  SDValue Result;
4868  if (OpFlags == X86II::MO_NO_FLAG &&
4869      X86::isOffsetSuitableForCodeModel(Offset, M)) {
4870    // A direct static reference to a global.
4871    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
4872    Offset = 0;
4873  } else {
4874    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
4875  }
4876
4877  if (Subtarget->isPICStyleRIPRel() &&
4878      (M == CodeModel::Small || M == CodeModel::Kernel))
4879    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
4880  else
4881    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
4882
4883  // With PIC, the address is actually $g + Offset.
4884  if (isGlobalRelativeToPICBase(OpFlags)) {
4885    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
4886                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
4887                         Result);
4888  }
4889
4890  // For globals that require a load from a stub to get the address, emit the
4891  // load.
4892  if (isGlobalStubReference(OpFlags))
4893    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
4894                         PseudoSourceValue::getGOT(), 0);
4895
4896  // If there was a non-zero offset that we didn't fold, create an explicit
4897  // addition for it.
4898  if (Offset != 0)
4899    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
4900                         DAG.getConstant(Offset, getPointerTy()));
4901
4902  return Result;
4903}
4904
4905SDValue
4906X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
4907  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
4908  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
4909  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
4910}
4911
4912static SDValue
4913GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
4914           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
4915           unsigned char OperandFlags) {
4916  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
4917  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
4918  DebugLoc dl = GA->getDebugLoc();
4919  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
4920                                           GA->getValueType(0),
4921                                           GA->getOffset(),
4922                                           OperandFlags);
4923  if (InFlag) {
4924    SDValue Ops[] = { Chain,  TGA, *InFlag };
4925    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
4926  } else {
4927    SDValue Ops[]  = { Chain, TGA };
4928    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
4929  }
4930
4931  // TLSADDR will be codegen'ed as a call. Inform MFI that function has calls.
4932  MFI->setHasCalls(true);
4933
4934  SDValue Flag = Chain.getValue(1);
4935  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
4936}
4937
4938// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
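// On ELF/IA-32 this typically expands to a sequence along the lines of
//   leal x@TLSGD(,%ebx,1), %eax
//   call ___tls_get_addr@PLT
// with the result returned in %eax.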
4939static SDValue
4940LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4941                                const EVT PtrVT) {
4942  SDValue InFlag;
4943  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
4944  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
4945                                     DAG.getNode(X86ISD::GlobalBaseReg,
4946                                                 DebugLoc::getUnknownLoc(),
4947                                                 PtrVT), InFlag);
4948  InFlag = Chain.getValue(1);
4949
4950  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
4951}
4952
4953// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
4954static SDValue
4955LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4956                                const EVT PtrVT) {
4957  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
4958                    X86::RAX, X86II::MO_TLSGD);
4959}
4960
4961// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
4962// "local exec" model.
4963static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4964                                   const EVT PtrVT, TLSModel::Model model,
4965                                   bool is64Bit) {
4966  DebugLoc dl = GA->getDebugLoc();
4967  // Get the Thread Pointer
4968  SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
4969                             DebugLoc::getUnknownLoc(), PtrVT,
4970                             DAG.getRegister(is64Bit? X86::FS : X86::GS,
4971                                             MVT::i32));
4972
4973  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
4974                                      NULL, 0);
4975
4976  unsigned char OperandFlags = 0;
4977  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
4978  // initial exec.
4979  unsigned WrapperKind = X86ISD::Wrapper;
4980  if (model == TLSModel::LocalExec) {
4981    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
4982  } else if (is64Bit) {
4983    assert(model == TLSModel::InitialExec);
4984    OperandFlags = X86II::MO_GOTTPOFF;
4985    WrapperKind = X86ISD::WrapperRIP;
4986  } else {
4987    assert(model == TLSModel::InitialExec);
4988    OperandFlags = X86II::MO_INDNTPOFF;
4989  }
4990
4991  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
4992  // exec)
4993  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
4994                                           GA->getOffset(), OperandFlags);
4995  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
4996
4997  if (model == TLSModel::InitialExec)
4998    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
4999                         PseudoSourceValue::getGOT(), 0);
5000
5001  // The address of the thread local variable is the add of the thread
5002  // pointer with the offset of the variable.
5003  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
5004}
5005
5006SDValue
5007X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
5008  // TODO: implement the "local dynamic" model
5009  // TODO: implement the "initial exec" model for pic executables
5010  assert(Subtarget->isTargetELF() &&
5011         "TLS not implemented for non-ELF targets");
5012  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
5013  const GlobalValue *GV = GA->getGlobal();
5014
5015  // If GV is an alias then use the aliasee for determining
5016  // thread-localness.
5017  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
5018    GV = GA->resolveAliasedGlobal(false);
5019
5020  TLSModel::Model model = getTLSModel(GV,
5021                                      getTargetMachine().getRelocationModel());
5022
5023  switch (model) {
5024  case TLSModel::GeneralDynamic:
5025  case TLSModel::LocalDynamic: // not implemented
5026    if (Subtarget->is64Bit())
5027      return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
5028    return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
5029
5030  case TLSModel::InitialExec:
5031  case TLSModel::LocalExec:
5032    return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
5033                               Subtarget->is64Bit());
5034  }
5035
5036  llvm_unreachable("Unreachable");
5037  return SDValue();
5038}
5039
5040
5041/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
5042/// take a 2 x i32 value to shift plus a shift amount.
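/// For SHL_PARTS by n the intent is roughly: if n < 32 then Hi = shld(Hi,Lo,n)
/// and Lo = Lo << n; if bit 5 of n is set then Hi = Lo << (n & 31) and Lo = 0
/// (the right-shift cases are handled analogously, with the sign fill for
/// SRA_PARTS).  The CMOVs below select between the two cases based on (n & 32).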
5043SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
5044  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
5045  EVT VT = Op.getValueType();
5046  unsigned VTBits = VT.getSizeInBits();
5047  DebugLoc dl = Op.getDebugLoc();
5048  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
5049  SDValue ShOpLo = Op.getOperand(0);
5050  SDValue ShOpHi = Op.getOperand(1);
5051  SDValue ShAmt  = Op.getOperand(2);
5052  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
5053                                     DAG.getConstant(VTBits - 1, MVT::i8))
5054                       : DAG.getConstant(0, VT);
5055
5056  SDValue Tmp2, Tmp3;
5057  if (Op.getOpcode() == ISD::SHL_PARTS) {
5058    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
5059    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
5060  } else {
5061    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
5062    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
5063  }
5064
5065  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
5066                                DAG.getConstant(VTBits, MVT::i8));
5067  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
5068                             AndNode, DAG.getConstant(0, MVT::i8));
5069
5070  SDValue Hi, Lo;
5071  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5072  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
5073  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
5074
5075  if (Op.getOpcode() == ISD::SHL_PARTS) {
5076    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
5077    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
5078  } else {
5079    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
5080    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
5081  }
5082
5083  SDValue Ops[2] = { Lo, Hi };
5084  return DAG.getMergeValues(Ops, 2, dl);
5085}
5086
5087SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5088  EVT SrcVT = Op.getOperand(0).getValueType();
5089
5090  if (SrcVT.isVector()) {
5091    if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
5092      return Op;
5093    }
5094    return SDValue();
5095  }
5096
5097  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
5098         "Unknown SINT_TO_FP to lower!");
5099
5100  // These are really Legal; return the operand so the caller accepts it as
5101  // Legal.
5102  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
5103    return Op;
5104  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
5105      Subtarget->is64Bit()) {
5106    return Op;
5107  }
5108
5109  DebugLoc dl = Op.getDebugLoc();
5110  unsigned Size = SrcVT.getSizeInBits()/8;
5111  MachineFunction &MF = DAG.getMachineFunction();
5112  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
5113  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5114  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5115                               StackSlot,
5116                               PseudoSourceValue::getFixedStack(SSFI), 0);
5117  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
5118}
5119
5120SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
5121                                     SDValue StackSlot,
5122                                     SelectionDAG &DAG) {
5123  // Build the FILD
5124  DebugLoc dl = Op.getDebugLoc();
5125  SDVTList Tys;
5126  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
5127  if (useSSE)
5128    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
5129  else
5130    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
5131  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
5132  SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
5133                               Tys, Ops, array_lengthof(Ops));
5134
5135  if (useSSE) {
5136    Chain = Result.getValue(1);
5137    SDValue InFlag = Result.getValue(2);
5138
5139    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
5140    // shouldn't be necessary except that RFP cannot be live across
5141    // multiple blocks. When stackifier is fixed, they can be uncoupled.
5142    MachineFunction &MF = DAG.getMachineFunction();
5143    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
5144    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5145    Tys = DAG.getVTList(MVT::Other);
5146    SDValue Ops[] = {
5147      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
5148    };
5149    Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops));
5150    Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
5151                         PseudoSourceValue::getFixedStack(SSFI), 0);
5152  }
5153
5154  return Result;
5155}
5156
5157// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
5158SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
5159  // This algorithm is not obvious. Here it is in C code, more or less:
5160  /*
5161    double uint64_to_double( uint32_t hi, uint32_t lo ) {
5162      static const __m128i exp = { 0x4330000045300000ULL, 0 };
5163      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
5164
5165      // Copy ints to xmm registers.
5166      __m128i xh = _mm_cvtsi32_si128( hi );
5167      __m128i xl = _mm_cvtsi32_si128( lo );
5168
5169      // Combine into low half of a single xmm register.
5170      __m128i x = _mm_unpacklo_epi32( xh, xl );
5171      __m128d d;
5172      double sd;
5173
5174      // Merge in appropriate exponents to give the integer bits the right
5175      // magnitude.
5176      x = _mm_unpacklo_epi32( x, exp );
5177
5178      // Subtract away the biases to deal with the IEEE-754 double precision
5179      // implicit 1.
5180      d = _mm_sub_pd( (__m128d) x, bias );
5181
5182      // All conversions up to here are exact. The correctly rounded result is
5183      // calculated using the current rounding mode using the following
5184      // horizontal add.
5185      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
5186      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
5187                                // store doesn't really need to be here (except
5188                                // maybe to zero the other double)
5189      return sd;
5190    }
5191  */
5192
5193  DebugLoc dl = Op.getDebugLoc();
5194  LLVMContext *Context = DAG.getContext();
5195
5196  // Build some magic constants.
5197  std::vector<Constant*> CV0;
5198  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
5199  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
5200  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
5201  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
5202  Constant *C0 = ConstantVector::get(CV0);
5203  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
5204
5205  std::vector<Constant*> CV1;
5206  CV1.push_back(
5207    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
5208  CV1.push_back(
5209    ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
5210  Constant *C1 = ConstantVector::get(CV1);
5211  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
5212
5213  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5214                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5215                                        Op.getOperand(0),
5216                                        DAG.getIntPtrConstant(1)));
5217  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5218                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5219                                        Op.getOperand(0),
5220                                        DAG.getIntPtrConstant(0)));
5221  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
5222  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
5223                              PseudoSourceValue::getConstantPool(), 0,
5224                              false, 16);
5225  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
5226  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
5227  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
5228                              PseudoSourceValue::getConstantPool(), 0,
5229                              false, 16);
5230  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
5231
5232  // Add the halves; easiest way is to swap them into another reg first.
5233  int ShufMask[2] = { 1, -1 };
5234  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
5235                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
5236  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
5237  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
5238                     DAG.getIntPtrConstant(0));
5239}
5240
5241// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
5242SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
5243  DebugLoc dl = Op.getDebugLoc();
5244  // FP constant to bias correct the final result.
5245  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
5246                                   MVT::f64);
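  // (Illustrative:) ORing a 32-bit value v into the low mantissa bits of
  // 0x1.0p52 produces the double 2^52 + v exactly, so subtracting Bias below
  // recovers v as an f64 with no rounding error.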
5247
5248  // Load the 32-bit value into an XMM register.
5249  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5250                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5251                                         Op.getOperand(0),
5252                                         DAG.getIntPtrConstant(0)));
5253
5254  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5255                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
5256                     DAG.getIntPtrConstant(0));
5257
5258  // Or the load with the bias.
5259  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
5260                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5261                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5262                                                   MVT::v2f64, Load)),
5263                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5264                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5265                                                   MVT::v2f64, Bias)));
5266  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5267                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
5268                   DAG.getIntPtrConstant(0));
5269
5270  // Subtract the bias.
5271  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
5272
5273  // Handle final rounding.
5274  EVT DestVT = Op.getValueType();
5275
5276  if (DestVT.bitsLT(MVT::f64)) {
5277    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
5278                       DAG.getIntPtrConstant(0));
5279  } else if (DestVT.bitsGT(MVT::f64)) {
5280    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
5281  }
5282
5283  // DestVT is f64; no final rounding is needed.
5284  return Sub;
5285}
5286
5287SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5288  SDValue N0 = Op.getOperand(0);
5289  DebugLoc dl = Op.getDebugLoc();
5290
5291  // Since UINT_TO_FP is not Legal (it's marked Custom), the DAG combiner
5292  // won't optimize it to a SINT_TO_FP when the sign bit is known zero, so
5293  // perform that optimization here.
5294  if (DAG.SignBitIsZero(N0))
5295    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
5296
5297  EVT SrcVT = N0.getValueType();
5298  if (SrcVT == MVT::i64) {
5299    // We only handle the SSE2 f64 case here; the caller can expand the rest.
5300    if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
5301      return SDValue();
5302
5303    return LowerUINT_TO_FP_i64(Op, DAG);
5304  } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
5305    return LowerUINT_TO_FP_i32(Op, DAG);
5306  }
5307
5308  assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");
5309
5310  // Make a 64-bit buffer, and use it to build an FILD.
5311  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
5312  SDValue WordOff = DAG.getConstant(4, getPointerTy());
5313  SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
5314                                   getPointerTy(), StackSlot, WordOff);
5315  SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5316                                StackSlot, NULL, 0);
5317  SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
5318                                OffsetSlot, NULL, 0);
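  // (Illustrative:) the 64-bit slot now holds { low32 = value, high32 = 0 },
  // so the x87 fild of that slot yields the zero-extended value exactly.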
5319  return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
5320}
5321
5322std::pair<SDValue,SDValue> X86TargetLowering::
5323FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
5324  DebugLoc dl = Op.getDebugLoc();
5325
5326  EVT DstTy = Op.getValueType();
5327
5328  if (!IsSigned) {
5329    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
5330    DstTy = MVT::i64;
5331  }
5332
5333  assert(DstTy.getSimpleVT() <= MVT::i64 &&
5334         DstTy.getSimpleVT() >= MVT::i16 &&
5335         "Unknown FP_TO_SINT to lower!");
5336
5337  // These are really Legal.
5338  if (DstTy == MVT::i32 &&
5339      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5340    return std::make_pair(SDValue(), SDValue());
5341  if (Subtarget->is64Bit() &&
5342      DstTy == MVT::i64 &&
5343      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5344    return std::make_pair(SDValue(), SDValue());
5345
5346  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
5347  // stack slot.
5348  MachineFunction &MF = DAG.getMachineFunction();
5349  unsigned MemSize = DstTy.getSizeInBits()/8;
5350  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
5351  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5352
5353  unsigned Opc;
5354  switch (DstTy.getSimpleVT().SimpleTy) {
5355  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
5356  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
5357  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
5358  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
5359  }
5360
5361  SDValue Chain = DAG.getEntryNode();
5362  SDValue Value = Op.getOperand(0);
5363  if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
5364    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
5365    Chain = DAG.getStore(Chain, dl, Value, StackSlot,
5366                         PseudoSourceValue::getFixedStack(SSFI), 0);
5367    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
5368    SDValue Ops[] = {
5369      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
5370    };
5371    Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
5372    Chain = Value.getValue(1);
5373    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
5374    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5375  }
5376
5377  // Build the FP_TO_INT*_IN_MEM
5378  SDValue Ops[] = { Chain, Value, StackSlot };
5379  SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
5380
5381  return std::make_pair(FIST, StackSlot);
5382}
5383
5384SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
5385  if (Op.getValueType().isVector()) {
5386    if (Op.getValueType() == MVT::v2i32 &&
5387        Op.getOperand(0).getValueType() == MVT::v2f64) {
5388      return Op;
5389    }
5390    return SDValue();
5391  }
5392
5393  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
5394  SDValue FIST = Vals.first, StackSlot = Vals.second;
5395  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
5396  if (FIST.getNode() == 0) return Op;
5397
5398  // Load the result.
5399  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5400                     FIST, StackSlot, NULL, 0);
5401}
5402
5403SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
5404  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
5405  SDValue FIST = Vals.first, StackSlot = Vals.second;
5406  assert(FIST.getNode() && "Unexpected failure");
5407
5408  // Load the result.
5409  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5410                     FIST, StackSlot, NULL, 0);
5411}
5412
5413SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
5414  LLVMContext *Context = DAG.getContext();
5415  DebugLoc dl = Op.getDebugLoc();
5416  EVT VT = Op.getValueType();
5417  EVT EltVT = VT;
5418  if (VT.isVector())
5419    EltVT = VT.getVectorElementType();
5420  std::vector<Constant*> CV;
5421  if (EltVT == MVT::f64) {
5422    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
5423    CV.push_back(C);
5424    CV.push_back(C);
5425  } else {
5426    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
5427    CV.push_back(C);
5428    CV.push_back(C);
5429    CV.push_back(C);
5430    CV.push_back(C);
5431  }
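  // (Illustrative:) for f64 this builds the mask 0x7FFFFFFFFFFFFFFF in each
  // lane; ANDing it with the operand clears only the sign bit, i.e. fabs().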
5432  Constant *C = ConstantVector::get(CV);
5433  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5434  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5435                               PseudoSourceValue::getConstantPool(), 0,
5436                               false, 16);
5437  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
5438}
5439
5440SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
5441  LLVMContext *Context = DAG.getContext();
5442  DebugLoc dl = Op.getDebugLoc();
5443  EVT VT = Op.getValueType();
5444  EVT EltVT = VT;
5445  if (VT.isVector())
5446    EltVT = VT.getVectorElementType();
5447  std::vector<Constant*> CV;
5448  if (EltVT == MVT::f64) {
5449    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
5450    CV.push_back(C);
5451    CV.push_back(C);
5452  } else {
5453    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
5454    CV.push_back(C);
5455    CV.push_back(C);
5456    CV.push_back(C);
5457    CV.push_back(C);
5458  }
5459  Constant *C = ConstantVector::get(CV);
5460  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5461  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5462                               PseudoSourceValue::getConstantPool(), 0,
5463                               false, 16);
5464  if (VT.isVector()) {
5465    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
5466                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
5467                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5468                                Op.getOperand(0)),
5469                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
5470  } else {
5471    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
5472  }
5473}
5474
5475SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
5476  LLVMContext *Context = DAG.getContext();
5477  SDValue Op0 = Op.getOperand(0);
5478  SDValue Op1 = Op.getOperand(1);
5479  DebugLoc dl = Op.getDebugLoc();
5480  EVT VT = Op.getValueType();
5481  EVT SrcVT = Op1.getValueType();
5482
5483  // If second operand is smaller, extend it first.
5484  if (SrcVT.bitsLT(VT)) {
5485    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
5486    SrcVT = VT;
5487  }
5488  // And if it is bigger, shrink it first.
5489  if (SrcVT.bitsGT(VT)) {
5490    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
5491    SrcVT = VT;
5492  }
5493
5494  // At this point the operands and the result should have the same
5495  // type, and that won't be f80 since that is not custom lowered.
5496
5497  // First get the sign bit of second operand.
5498  std::vector<Constant*> CV;
5499  if (SrcVT == MVT::f64) {
5500    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
5501    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5502  } else {
5503    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
5504    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5505    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5506    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5507  }
5508  Constant *C = ConstantVector::get(CV);
5509  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5510  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
5511                                PseudoSourceValue::getConstantPool(), 0,
5512                                false, 16);
5513  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
5514
5515  // Shift sign bit right or left if the two operands have different types.
5516  if (SrcVT.bitsGT(VT)) {
5517    // Op0 is MVT::f32, Op1 is MVT::f64.
5518    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
5519    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
5520                          DAG.getConstant(32, MVT::i32));
5521    SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
5522    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
5523                          DAG.getIntPtrConstant(0));
5524  }
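  // (Illustrative:) the f64 sign bit is bit 63; after the FSRL by 32 above it
  // sits at bit 31 of the low lane, which is exactly the f32 sign bit.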
5525
5526  // Clear first operand sign bit.
5527  CV.clear();
5528  if (VT == MVT::f64) {
5529    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
5530    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5531  } else {
5532    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
5533    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5534    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5535    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5536  }
5537  C = ConstantVector::get(CV);
5538  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5539  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5540                                PseudoSourceValue::getConstantPool(), 0,
5541                                false, 16);
5542  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
5543
5544  // Or the value with the sign bit.
5545  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
5546}
5547
5548/// Emit nodes that will be selected as "test Op0,Op0", or something
5549/// equivalent.
5550SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
5551                                    SelectionDAG &DAG) {
5552  DebugLoc dl = Op.getDebugLoc();
5553
5554  // CF and OF aren't always set the way we want. Determine which
5555  // of these we need.
5556  bool NeedCF = false;
5557  bool NeedOF = false;
5558  switch (X86CC) {
5559  case X86::COND_A: case X86::COND_AE:
5560  case X86::COND_B: case X86::COND_BE:
5561    NeedCF = true;
5562    break;
5563  case X86::COND_G: case X86::COND_GE:
5564  case X86::COND_L: case X86::COND_LE:
5565  case X86::COND_O: case X86::COND_NO:
5566    NeedOF = true;
5567    break;
5568  default: break;
5569  }
5570
5571  // See if we can use the EFLAGS value from the operand instead of
5572  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
5573  // we prove that the arithmetic won't overflow, we can't use OF or CF.
5574  if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
5575    unsigned Opcode = 0;
5576    unsigned NumOperands = 0;
5577    switch (Op.getNode()->getOpcode()) {
5578    case ISD::ADD:
5579      // Due to an isel shortcoming, be conservative if this add is likely to
5580      // be selected as part of a load-modify-store instruction. When the root
5581      // node in a match is a store, isel doesn't know how to remap non-chain
5582      // non-flag uses of other nodes in the match, such as the ADD in this
5583      // case. This leads to the ADD being left around and reselected, with
5584      // the result being two adds in the output.
5585      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5586           UE = Op.getNode()->use_end(); UI != UE; ++UI)
5587        if (UI->getOpcode() == ISD::STORE)
5588          goto default_case;
5589      if (ConstantSDNode *C =
5590            dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
5591        // An add of one will be selected as an INC.
5592        if (C->getAPIntValue() == 1) {
5593          Opcode = X86ISD::INC;
5594          NumOperands = 1;
5595          break;
5596        }
5597        // An add of negative one (subtract of one) will be selected as a DEC.
5598        if (C->getAPIntValue().isAllOnesValue()) {
5599          Opcode = X86ISD::DEC;
5600          NumOperands = 1;
5601          break;
5602        }
5603      }
5604      // Otherwise use a regular EFLAGS-setting add.
5605      Opcode = X86ISD::ADD;
5606      NumOperands = 2;
5607      break;
5608    case ISD::AND: {
5609      // If the primary result of the AND isn't used, don't bother emitting
5610      // X86ISD::AND, because a TEST instruction will be better.
5611      bool NonFlagUse = false;
5612      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5613           UE = Op.getNode()->use_end(); UI != UE; ++UI)
5614        if (UI->getOpcode() != ISD::BRCOND &&
5615            (UI->getOpcode() != ISD::SELECT || UI.getOperandNo() != 0) &&
5616            UI->getOpcode() != ISD::SETCC) {
5617          NonFlagUse = true;
5618          break;
5619        }
5620      if (!NonFlagUse)
5621        break;
5622    }
5623    // FALL THROUGH
5624    case ISD::SUB:
5625    case ISD::OR:
5626    case ISD::XOR:
5627      // Due to the ISEL shortcoming noted above, be conservative if this op is
5628      // likely to be selected as part of a load-modify-store instruction.
5629      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5630           UE = Op.getNode()->use_end(); UI != UE; ++UI)
5631        if (UI->getOpcode() == ISD::STORE)
5632          goto default_case;
5633      // Otherwise use a regular EFLAGS-setting instruction.
5634      switch (Op.getNode()->getOpcode()) {
5635      case ISD::SUB: Opcode = X86ISD::SUB; break;
5636      case ISD::OR:  Opcode = X86ISD::OR;  break;
5637      case ISD::XOR: Opcode = X86ISD::XOR; break;
5638      case ISD::AND: Opcode = X86ISD::AND; break;
5639      default: llvm_unreachable("unexpected operator!");
5640      }
5641      NumOperands = 2;
5642      break;
5643    case X86ISD::ADD:
5644    case X86ISD::SUB:
5645    case X86ISD::INC:
5646    case X86ISD::DEC:
5647    case X86ISD::OR:
5648    case X86ISD::XOR:
5649    case X86ISD::AND:
5650      return SDValue(Op.getNode(), 1);
5651    default:
5652    default_case:
5653      break;
5654    }
5655    if (Opcode != 0) {
5656      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
5657      SmallVector<SDValue, 4> Ops;
5658      for (unsigned i = 0; i != NumOperands; ++i)
5659        Ops.push_back(Op.getOperand(i));
5660      SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
5661      DAG.ReplaceAllUsesWith(Op, New);
5662      return SDValue(New.getNode(), 1);
5663    }
5664  }
5665
5666  // Otherwise just emit a CMP with 0, which is the TEST pattern.
5667  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
5668                     DAG.getConstant(0, Op.getValueType()));
5669}
5670
5671/// Emit nodes that will be selected as "cmp Op0,Op1", or something
5672/// equivalent.
5673SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
5674                                   SelectionDAG &DAG) {
5675  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
5676    if (C->getAPIntValue() == 0)
5677      return EmitTest(Op0, X86CC, DAG);
5678
5679  DebugLoc dl = Op0.getDebugLoc();
5680  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
5681}
5682
5683SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
5684  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
5685  SDValue Op0 = Op.getOperand(0);
5686  SDValue Op1 = Op.getOperand(1);
5687  DebugLoc dl = Op.getDebugLoc();
5688  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
5689
5690  // Lower (X & (1 << N)) == 0 to BT(X, N).
5691  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
5692  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
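  // For example (illustrative only): for "(x & (1 << n)) != 0" the ideal
  // selection is roughly
  //   bt   n, x        ; CF = bit n of x
  //   setb result      ; result = CF
  // instead of materializing the mask with a variable shift.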
5693  if (Op0.getOpcode() == ISD::AND &&
5694      Op0.hasOneUse() &&
5695      Op1.getOpcode() == ISD::Constant &&
5696      cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
5697      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5698    SDValue LHS, RHS;
5699    if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
5700      if (ConstantSDNode *Op010C =
5701            dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
5702        if (Op010C->getZExtValue() == 1) {
5703          LHS = Op0.getOperand(0);
5704          RHS = Op0.getOperand(1).getOperand(1);
5705        }
5706    } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
5707      if (ConstantSDNode *Op000C =
5708            dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
5709        if (Op000C->getZExtValue() == 1) {
5710          LHS = Op0.getOperand(1);
5711          RHS = Op0.getOperand(0).getOperand(1);
5712        }
5713    } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
5714      ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
5715      SDValue AndLHS = Op0.getOperand(0);
5716      if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
5717        LHS = AndLHS.getOperand(0);
5718        RHS = AndLHS.getOperand(1);
5719      }
5720    }
5721
5722    if (LHS.getNode()) {
5723      // If LHS is i8, promote it with any_extend; there is no i8 BT
5724      // instruction.  Since the shift amount is in-range-or-undefined, we know
5725      // that doing a bittest on the widened value is ok.  We extend to i32
5726      // because the encoding for the i16 version is larger than the i32 version.
5727      if (LHS.getValueType() == MVT::i8)
5728        LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
5729
5730      // If the operand types disagree, extend the shift amount to match.  Since
5731      // BT ignores high bits (like shifts) we can use anyextend.
5732      if (LHS.getValueType() != RHS.getValueType())
5733        RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
5734
5735      SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
5736      unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
5737      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
5738                         DAG.getConstant(Cond, MVT::i8), BT);
5739    }
5740  }
5741
5742  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
5743  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
5744  if (X86CC == X86::COND_INVALID)
5745    return SDValue();
5746
5747  SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
5748
5749  // Use sbb x, x to materialize carry bit into a GPR.
5750  if (X86CC == X86::COND_B)
5751    return DAG.getNode(ISD::AND, dl, MVT::i8,
5752                       DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
5753                                   DAG.getConstant(X86CC, MVT::i8), Cond),
5754                       DAG.getConstant(1, MVT::i8));
5755
5756  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
5757                     DAG.getConstant(X86CC, MVT::i8), Cond);
5758}
5759
5760SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
5761  SDValue Cond;
5762  SDValue Op0 = Op.getOperand(0);
5763  SDValue Op1 = Op.getOperand(1);
5764  SDValue CC = Op.getOperand(2);
5765  EVT VT = Op.getValueType();
5766  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
5767  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
5768  DebugLoc dl = Op.getDebugLoc();
5769
5770  if (isFP) {
5771    unsigned SSECC = 8;
5772    EVT VT0 = Op0.getValueType();
5773    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
5774    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
5775    bool Swap = false;
5776
5777    switch (SetCCOpcode) {
5778    default: break;
5779    case ISD::SETOEQ:
5780    case ISD::SETEQ:  SSECC = 0; break;
5781    case ISD::SETOGT:
5782    case ISD::SETGT: Swap = true; // Fallthrough
5783    case ISD::SETLT:
5784    case ISD::SETOLT: SSECC = 1; break;
5785    case ISD::SETOGE:
5786    case ISD::SETGE: Swap = true; // Fallthrough
5787    case ISD::SETLE:
5788    case ISD::SETOLE: SSECC = 2; break;
5789    case ISD::SETUO:  SSECC = 3; break;
5790    case ISD::SETUNE:
5791    case ISD::SETNE:  SSECC = 4; break;
5792    case ISD::SETULE: Swap = true;
5793    case ISD::SETUGE: SSECC = 5; break;
5794    case ISD::SETULT: Swap = true;
5795    case ISD::SETUGT: SSECC = 6; break;
5796    case ISD::SETO:   SSECC = 7; break;
5797    }
5798    if (Swap)
5799      std::swap(Op0, Op1);
5800
5801    // In the two special cases we can't handle, emit two comparisons.
5802    if (SSECC == 8) {
5803      if (SetCCOpcode == ISD::SETUEQ) {
5804        SDValue UNORD, EQ;
5805        UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
5806        EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
5807        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
5808      }
5809      else if (SetCCOpcode == ISD::SETONE) {
5810        SDValue ORD, NEQ;
5811        ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
5812        NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
5813        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
5814      }
5815      llvm_unreachable("Illegal FP comparison");
5816    }
5817    // Handle all other FP comparisons here.
5818    return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
5819  }
5820
5821  // We are handling one of the integer comparisons here.  Since SSE only has
5822  // GT and EQ comparisons for integers, swapping operands and multiple
5823  // operations may be required for some comparisons.
5824  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
5825  bool Swap = false, Invert = false, FlipSigns = false;
5826
5827  switch (VT.getSimpleVT().SimpleTy) {
5828  default: break;
5829  case MVT::v8i8:
5830  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
5831  case MVT::v4i16:
5832  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
5833  case MVT::v2i32:
5834  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
5835  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
5836  }
5837
5838  switch (SetCCOpcode) {
5839  default: break;
5840  case ISD::SETNE:  Invert = true;
5841  case ISD::SETEQ:  Opc = EQOpc; break;
5842  case ISD::SETLT:  Swap = true;
5843  case ISD::SETGT:  Opc = GTOpc; break;
5844  case ISD::SETGE:  Swap = true;
5845  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
5846  case ISD::SETULT: Swap = true;
5847  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
5848  case ISD::SETUGE: Swap = true;
5849  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
5850  }
5851  if (Swap)
5852    std::swap(Op0, Op1);
5853
5854  // Since SSE has no unsigned integer comparisons, we need to flip the sign
5855  // bits of the inputs before performing those operations.
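  // (Illustrative:) for unsigned x and y,
  //   x >u y  <=>  (x ^ SignBit) >s (y ^ SignBit),
  // because XORing with the sign bit maps [0, 2^n) monotonically onto the
  // signed range [-2^(n-1), 2^(n-1)).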
5856  if (FlipSigns) {
5857    EVT EltVT = VT.getVectorElementType();
5858    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
5859                                      EltVT);
5860    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
5861    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
5862                                    SignBits.size());
5863    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
5864    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
5865  }
5866
5867  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
5868
5869  // If the logical-not of the result is required, perform that now.
5870  if (Invert)
5871    Result = DAG.getNOT(dl, Result, VT);
5872
5873  return Result;
5874}
5875
5876// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
5877static bool isX86LogicalCmp(SDValue Op) {
5878  unsigned Opc = Op.getNode()->getOpcode();
5879  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
5880    return true;
5881  if (Op.getResNo() == 1 &&
5882      (Opc == X86ISD::ADD ||
5883       Opc == X86ISD::SUB ||
5884       Opc == X86ISD::SMUL ||
5885       Opc == X86ISD::UMUL ||
5886       Opc == X86ISD::INC ||
5887       Opc == X86ISD::DEC ||
5888       Opc == X86ISD::OR ||
5889       Opc == X86ISD::XOR ||
5890       Opc == X86ISD::AND))
5891    return true;
5892
5893  return false;
5894}
5895
5896SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
5897  bool addTest = true;
5898  SDValue Cond  = Op.getOperand(0);
5899  DebugLoc dl = Op.getDebugLoc();
5900  SDValue CC;
5901
5902  if (Cond.getOpcode() == ISD::SETCC) {
5903    SDValue NewCond = LowerSETCC(Cond, DAG);
5904    if (NewCond.getNode())
5905      Cond = NewCond;
5906  }
5907
5908  // Look past (and (setcc_carry (cmp ...)), 1).
5909  if (Cond.getOpcode() == ISD::AND &&
5910      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
5911    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
5912    if (C && C->getAPIntValue() == 1)
5913      Cond = Cond.getOperand(0);
5914  }
5915
5916  // If the condition flag is set by an X86ISD::CMP, then use it as the
5917  // condition-setting operand in place of the X86ISD::SETCC.
5918  if (Cond.getOpcode() == X86ISD::SETCC ||
5919      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
5920    CC = Cond.getOperand(0);
5921
5922    SDValue Cmp = Cond.getOperand(1);
5923    unsigned Opc = Cmp.getOpcode();
5924    EVT VT = Op.getValueType();
5925
5926    bool IllegalFPCMov = false;
5927    if (VT.isFloatingPoint() && !VT.isVector() &&
5928        !isScalarFPTypeInSSEReg(VT))  // FPStack?
5929      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
5930
5931    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
5932        Opc == X86ISD::BT) { // FIXME
5933      Cond = Cmp;
5934      addTest = false;
5935    }
5936  }
5937
5938  if (addTest) {
5939    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5940    Cond = EmitTest(Cond, X86::COND_NE, DAG);
5941  }
5942
5943  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
5944  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
5945  // condition is true.
5946  SDValue Ops[] = { Op.getOperand(2), Op.getOperand(1), CC, Cond };
5947  return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
5948}
5949
5950// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
5951// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
5952// from the AND / OR.
5953static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
5954  Opc = Op.getOpcode();
5955  if (Opc != ISD::OR && Opc != ISD::AND)
5956    return false;
5957  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
5958          Op.getOperand(0).hasOneUse() &&
5959          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
5960          Op.getOperand(1).hasOneUse());
5961}
5962
5963// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
5964// the constant 1, where the SETCC node has a single use.
5965static bool isXor1OfSetCC(SDValue Op) {
5966  if (Op.getOpcode() != ISD::XOR)
5967    return false;
5968  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5969  if (N1C && N1C->getAPIntValue() == 1) {
5970    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
5971      Op.getOperand(0).hasOneUse();
5972  }
5973  return false;
5974}
5975
5976SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
5977  bool addTest = true;
5978  SDValue Chain = Op.getOperand(0);
5979  SDValue Cond  = Op.getOperand(1);
5980  SDValue Dest  = Op.getOperand(2);
5981  DebugLoc dl = Op.getDebugLoc();
5982  SDValue CC;
5983
5984  if (Cond.getOpcode() == ISD::SETCC) {
5985    SDValue NewCond = LowerSETCC(Cond, DAG);
5986    if (NewCond.getNode())
5987      Cond = NewCond;
5988  }
5989#if 0
5990  // FIXME: LowerXALUO doesn't handle these!!
5991  else if (Cond.getOpcode() == X86ISD::ADD  ||
5992           Cond.getOpcode() == X86ISD::SUB  ||
5993           Cond.getOpcode() == X86ISD::SMUL ||
5994           Cond.getOpcode() == X86ISD::UMUL)
5995    Cond = LowerXALUO(Cond, DAG);
5996#endif
5997
5998  // Look past (and (setcc_carry (cmp ...)), 1).
5999  if (Cond.getOpcode() == ISD::AND &&
6000      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6001    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6002    if (C && C->getAPIntValue() == 1)
6003      Cond = Cond.getOperand(0);
6004  }
6005
6006  // If the condition flag is set by an X86ISD::CMP, then use it as the
6007  // condition-setting operand in place of the X86ISD::SETCC.
6008  if (Cond.getOpcode() == X86ISD::SETCC ||
6009      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6010    CC = Cond.getOperand(0);
6011
6012    SDValue Cmp = Cond.getOperand(1);
6013    unsigned Opc = Cmp.getOpcode();
6014    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
6015    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
6016      Cond = Cmp;
6017      addTest = false;
6018    } else {
6019      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
6020      default: break;
6021      case X86::COND_O:
6022      case X86::COND_B:
6023        // These can only come from an arithmetic instruction with overflow,
6024        // e.g. SADDO, UADDO.
6025        Cond = Cond.getNode()->getOperand(1);
6026        addTest = false;
6027        break;
6028      }
6029    }
6030  } else {
6031    unsigned CondOpc;
6032    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
6033      SDValue Cmp = Cond.getOperand(0).getOperand(1);
6034      if (CondOpc == ISD::OR) {
6035        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
6036        // two branches instead of an explicit OR instruction with a
6037        // separate test.
6038        if (Cmp == Cond.getOperand(1).getOperand(1) &&
6039            isX86LogicalCmp(Cmp)) {
6040          CC = Cond.getOperand(0).getOperand(0);
6041          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6042                              Chain, Dest, CC, Cmp);
6043          CC = Cond.getOperand(1).getOperand(0);
6044          Cond = Cmp;
6045          addTest = false;
6046        }
6047      } else { // ISD::AND
6048        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
6049        // two branches instead of an explicit AND instruction with a
6050        // separate test. However, we only do this if this block doesn't
6051        // have a fall-through edge, because this requires an explicit
6052        // jmp when the condition is false.
6053        if (Cmp == Cond.getOperand(1).getOperand(1) &&
6054            isX86LogicalCmp(Cmp) &&
6055            Op.getNode()->hasOneUse()) {
6056          X86::CondCode CCode =
6057            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6058          CCode = X86::GetOppositeBranchCondition(CCode);
6059          CC = DAG.getConstant(CCode, MVT::i8);
6060          SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
6061          // Look for an unconditional branch following this conditional branch.
6062          // We need it so that we can reverse the successors in order to
6063          // implement FCMP_OEQ.
6064          if (User.getOpcode() == ISD::BR) {
6065            SDValue FalseBB = User.getOperand(1);
6066            SDValue NewBR =
6067              DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
6068            assert(NewBR == User);
6069            Dest = FalseBB;
6070
6071            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6072                                Chain, Dest, CC, Cmp);
6073            X86::CondCode CCode =
6074              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
6075            CCode = X86::GetOppositeBranchCondition(CCode);
6076            CC = DAG.getConstant(CCode, MVT::i8);
6077            Cond = Cmp;
6078            addTest = false;
6079          }
6080        }
6081      }
6082    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
6083      // Recognize "xorb (setcc), 1" patterns; the xor inverts the condition.
6084      // The DAG combiner should have transformed this already, except when the
6085      // condition is set by an arithmetic-with-overflow node.
6086      X86::CondCode CCode =
6087        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6088      CCode = X86::GetOppositeBranchCondition(CCode);
6089      CC = DAG.getConstant(CCode, MVT::i8);
6090      Cond = Cond.getOperand(0).getOperand(1);
6091      addTest = false;
6092    }
6093  }
6094
6095  if (addTest) {
6096    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6097    Cond = EmitTest(Cond, X86::COND_NE, DAG);
6098  }
6099  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6100                     Chain, Dest, CC, Cond);
6101}
6102
6103
6104// Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
6105// Calls to _alloca are needed to probe the stack when allocating more than 4k
6106// bytes in one go. Touching the stack at 4K increments is necessary to ensure
6107// that the guard pages used by the OS virtual memory manager are allocated in
6108// the correct sequence.
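// (Illustrative:) the lowering below materializes roughly
//   mov   size, %eax
//   call  _alloca          ; probes and adjusts %esp in 4K steps
//   mov   %esp, result
// where _alloca is the CRT stack-probe helper referenced further down.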
6109SDValue
6110X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
6111                                           SelectionDAG &DAG) {
6112  assert(Subtarget->isTargetCygMing() &&
6113         "This should be used only on Cygwin/Mingw targets");
6114  DebugLoc dl = Op.getDebugLoc();
6115
6116  // Get the inputs.
6117  SDValue Chain = Op.getOperand(0);
6118  SDValue Size  = Op.getOperand(1);
6119  // FIXME: Ensure alignment here
6120
6121  SDValue Flag;
6122
6123  EVT IntPtr = getPointerTy();
6124  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
6125
6126  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
6127
6128  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
6129  Flag = Chain.getValue(1);
6130
6131  SDVTList  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
6132  SDValue Ops[] = { Chain,
6133                      DAG.getTargetExternalSymbol("_alloca", IntPtr),
6134                      DAG.getRegister(X86::EAX, IntPtr),
6135                      DAG.getRegister(X86StackPtr, SPTy),
6136                      Flag };
6137  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5);
6138  Flag = Chain.getValue(1);
6139
6140  Chain = DAG.getCALLSEQ_END(Chain,
6141                             DAG.getIntPtrConstant(0, true),
6142                             DAG.getIntPtrConstant(0, true),
6143                             Flag);
6144
6145  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
6146
6147  SDValue Ops1[2] = { Chain.getValue(0), Chain };
6148  return DAG.getMergeValues(Ops1, 2, dl);
6149}
6150
6151SDValue
6152X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
6153                                           SDValue Chain,
6154                                           SDValue Dst, SDValue Src,
6155                                           SDValue Size, unsigned Align,
6156                                           const Value *DstSV,
6157                                           uint64_t DstSVOff) {
6158  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
6159
6160  // If not DWORD aligned or size is more than the threshold, call the library.
6161  // The libc version is likely to be faster for these cases. It can use the
6162  // address value and run time information about the CPU.
6163  if ((Align & 3) != 0 ||
6164      !ConstantSize ||
6165      ConstantSize->getZExtValue() >
6166        getSubtarget()->getMaxInlineSizeThreshold()) {
6167    SDValue InFlag(0, 0);
6168
6169    // Check to see if there is a specialized entry-point for memory zeroing.
6170    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
6171
6172    if (const char *bzeroEntry =  V &&
6173        V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
6174      EVT IntPtr = getPointerTy();
6175      const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext());
6176      TargetLowering::ArgListTy Args;
6177      TargetLowering::ArgListEntry Entry;
6178      Entry.Node = Dst;
6179      Entry.Ty = IntPtrTy;
6180      Args.push_back(Entry);
6181      Entry.Node = Size;
6182      Args.push_back(Entry);
6183      std::pair<SDValue,SDValue> CallResult =
6184        LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
6185                    false, false, false, false,
6186                    0, CallingConv::C, false, /*isReturnValueUsed=*/false,
6187                    DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl,
6188                    DAG.GetOrdering(Chain.getNode()));
6189      return CallResult.second;
6190    }
6191
6192    // Otherwise have the target-independent code call memset.
6193    return SDValue();
6194  }
6195
6196  uint64_t SizeVal = ConstantSize->getZExtValue();
6197  SDValue InFlag(0, 0);
6198  EVT AVT;
6199  SDValue Count;
6200  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
6201  unsigned BytesLeft = 0;
6202  bool TwoRepStos = false;
6203  if (ValC) {
6204    unsigned ValReg;
6205    uint64_t Val = ValC->getZExtValue() & 255;
6206
6207    // If the value is a constant, then we can potentially use larger sets.
6208    switch (Align & 3) {
6209    case 2:   // WORD aligned
6210      AVT = MVT::i16;
6211      ValReg = X86::AX;
6212      Val = (Val << 8) | Val;
6213      break;
6214    case 0:  // DWORD aligned
6215      AVT = MVT::i32;
6216      ValReg = X86::EAX;
6217      Val = (Val << 8)  | Val;
6218      Val = (Val << 16) | Val;
6219      if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
6220        AVT = MVT::i64;
6221        ValReg = X86::RAX;
6222        Val = (Val << 32) | Val;
6223      }
6224      break;
6225    default:  // Byte aligned
6226      AVT = MVT::i8;
6227      ValReg = X86::AL;
6228      Count = DAG.getIntPtrConstant(SizeVal);
6229      break;
6230    }
6231
6232    if (AVT.bitsGT(MVT::i8)) {
6233      unsigned UBytes = AVT.getSizeInBits() / 8;
6234      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
6235      BytesLeft = SizeVal % UBytes;
6236    }
6237
6238    Chain  = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
6239                              InFlag);
6240    InFlag = Chain.getValue(1);
6241  } else {
6242    AVT = MVT::i8;
6243    Count  = DAG.getIntPtrConstant(SizeVal);
6244    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
6245    InFlag = Chain.getValue(1);
6246  }
6247
6248  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
6249                                                              X86::ECX,
6250                            Count, InFlag);
6251  InFlag = Chain.getValue(1);
6252  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
6253                                                              X86::EDI,
6254                            Dst, InFlag);
6255  InFlag = Chain.getValue(1);
6256
6257  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6258  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
6259  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
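  // (Illustrative:) for a constant value and DWORD alignment this emits
  // roughly "mov val, %eax; mov count, %ecx; mov dst, %edi; rep stosl",
  // with any 1-7 byte tail handled by the extra memset further below.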
6260
6261  if (TwoRepStos) {
6262    InFlag = Chain.getValue(1);
6263    Count  = Size;
6264    EVT CVT = Count.getValueType();
6265    SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
6266                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
6267    Chain  = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
6268                                                             X86::ECX,
6269                              Left, InFlag);
6270    InFlag = Chain.getValue(1);
6271    Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6272    SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
6273    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
6274  } else if (BytesLeft) {
6275    // Handle the last 1 - 7 bytes.
6276    unsigned Offset = SizeVal - BytesLeft;
6277    EVT AddrVT = Dst.getValueType();
6278    EVT SizeVT = Size.getValueType();
6279
6280    Chain = DAG.getMemset(Chain, dl,
6281                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
6282                                      DAG.getConstant(Offset, AddrVT)),
6283                          Src,
6284                          DAG.getConstant(BytesLeft, SizeVT),
6285                          Align, DstSV, DstSVOff + Offset);
6286  }
6287
6288  // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
6289  return Chain;
6290}
6291
6292SDValue
6293X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
6294                                      SDValue Chain, SDValue Dst, SDValue Src,
6295                                      SDValue Size, unsigned Align,
6296                                      bool AlwaysInline,
6297                                      const Value *DstSV, uint64_t DstSVOff,
6298                                      const Value *SrcSV, uint64_t SrcSVOff) {
6299  // This requires the copy size to be a constant, preferably
6300  // within a subtarget-specific limit.
6301  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
6302  if (!ConstantSize)
6303    return SDValue();
6304  uint64_t SizeVal = ConstantSize->getZExtValue();
6305  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
6306    return SDValue();
6307
6308  // If not DWORD aligned, call the library.
6309  if ((Align & 3) != 0)
6310    return SDValue();
6311
6312  // DWORD aligned
6313  EVT AVT = MVT::i32;
6314  if (Subtarget->is64Bit() && ((Align & 0x7) == 0))  // QWORD aligned
6315    AVT = MVT::i64;
6316
6317  unsigned UBytes = AVT.getSizeInBits() / 8;
6318  unsigned CountVal = SizeVal / UBytes;
6319  SDValue Count = DAG.getIntPtrConstant(CountVal);
6320  unsigned BytesLeft = SizeVal % UBytes;
6321
6322  SDValue InFlag(0, 0);
6323  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
6324                                                              X86::ECX,
6325                            Count, InFlag);
6326  InFlag = Chain.getValue(1);
6327  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
6328                                                             X86::EDI,
6329                            Dst, InFlag);
6330  InFlag = Chain.getValue(1);
6331  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
6332                                                              X86::ESI,
6333                            Src, InFlag);
6334  InFlag = Chain.getValue(1);
6335
6336  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6337  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
6338  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
6339                                array_lengthof(Ops));
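  // (Illustrative:) this expands to roughly "mov count, %rcx; mov dst, %rdi;
  // mov src, %rsi; rep movsq" on 64-bit targets (rep movsl with 32-bit
  // registers otherwise), with any 1-7 byte tail copied by the memcpy below.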
6340
6341  SmallVector<SDValue, 4> Results;
6342  Results.push_back(RepMovs);
6343  if (BytesLeft) {
6344    // Handle the last 1 - 7 bytes.
6345    unsigned Offset = SizeVal - BytesLeft;
6346    EVT DstVT = Dst.getValueType();
6347    EVT SrcVT = Src.getValueType();
6348    EVT SizeVT = Size.getValueType();
6349    Results.push_back(DAG.getMemcpy(Chain, dl,
6350                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
6351                                                DAG.getConstant(Offset, DstVT)),
6352                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
6353                                                DAG.getConstant(Offset, SrcVT)),
6354                                    DAG.getConstant(BytesLeft, SizeVT),
6355                                    Align, AlwaysInline,
6356                                    DstSV, DstSVOff + Offset,
6357                                    SrcSV, SrcSVOff + Offset));
6358  }
6359
6360  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6361                     &Results[0], Results.size());
6362}
6363
6364SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
6365  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6366  DebugLoc dl = Op.getDebugLoc();
6367
6368  if (!Subtarget->is64Bit()) {
6369    // vastart just stores the address of the VarArgsFrameIndex slot into the
6370    // memory location argument.
6371    SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
6372    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
6373  }
6374
6375  // __va_list_tag:
6376  //   gp_offset         (0 - 6 * 8)
6377  //   fp_offset         (48 - 48 + 8 * 16)
6378  //   overflow_arg_area (point to parameters coming in memory).
6379  //   reg_save_area
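  //
  // In C terms (illustrative):
  //   struct __va_list_tag {
  //     unsigned gp_offset;          // byte offset 0
  //     unsigned fp_offset;          // byte offset 4
  //     void    *overflow_arg_area;  // byte offset 8
  //     void    *reg_save_area;      // byte offset 16
  //   };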
6380  SmallVector<SDValue, 8> MemOps;
6381  SDValue FIN = Op.getOperand(1);
6382  // Store gp_offset
6383  SDValue Store = DAG.getStore(Op.getOperand(0), dl,
6384                                 DAG.getConstant(VarArgsGPOffset, MVT::i32),
6385                                 FIN, SV, 0);
6386  MemOps.push_back(Store);
6387
6388  // Store fp_offset
6389  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6390                    FIN, DAG.getIntPtrConstant(4));
6391  Store = DAG.getStore(Op.getOperand(0), dl,
6392                       DAG.getConstant(VarArgsFPOffset, MVT::i32),
6393                       FIN, SV, 0);
6394  MemOps.push_back(Store);
6395
6396  // Store ptr to overflow_arg_area
6397  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6398                    FIN, DAG.getIntPtrConstant(4));
6399  SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
6400  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0);
6401  MemOps.push_back(Store);
6402
6403  // Store ptr to reg_save_area.
6404  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6405                    FIN, DAG.getIntPtrConstant(8));
6406  SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
6407  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0);
6408  MemOps.push_back(Store);
6409  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6410                     &MemOps[0], MemOps.size());
6411}
6412
6413SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
6414  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6415  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
6416  SDValue Chain = Op.getOperand(0);
6417  SDValue SrcPtr = Op.getOperand(1);
6418  SDValue SrcSV = Op.getOperand(2);
6419
6420  llvm_report_error("VAArgInst is not yet implemented for x86-64!");
6421  return SDValue();
6422}
6423
6424SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
6425  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6426  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
6427  SDValue Chain = Op.getOperand(0);
6428  SDValue DstPtr = Op.getOperand(1);
6429  SDValue SrcPtr = Op.getOperand(2);
6430  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
6431  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
6432  DebugLoc dl = Op.getDebugLoc();
6433
6434  return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
6435                       DAG.getIntPtrConstant(24), 8, false,
6436                       DstSV, 0, SrcSV, 0);
6437}
6438
6439SDValue
6440X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
6441  DebugLoc dl = Op.getDebugLoc();
6442  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6443  switch (IntNo) {
6444  default: return SDValue();    // Don't custom lower most intrinsics.
6445  // Comparison intrinsics.
6446  case Intrinsic::x86_sse_comieq_ss:
6447  case Intrinsic::x86_sse_comilt_ss:
6448  case Intrinsic::x86_sse_comile_ss:
6449  case Intrinsic::x86_sse_comigt_ss:
6450  case Intrinsic::x86_sse_comige_ss:
6451  case Intrinsic::x86_sse_comineq_ss:
6452  case Intrinsic::x86_sse_ucomieq_ss:
6453  case Intrinsic::x86_sse_ucomilt_ss:
6454  case Intrinsic::x86_sse_ucomile_ss:
6455  case Intrinsic::x86_sse_ucomigt_ss:
6456  case Intrinsic::x86_sse_ucomige_ss:
6457  case Intrinsic::x86_sse_ucomineq_ss:
6458  case Intrinsic::x86_sse2_comieq_sd:
6459  case Intrinsic::x86_sse2_comilt_sd:
6460  case Intrinsic::x86_sse2_comile_sd:
6461  case Intrinsic::x86_sse2_comigt_sd:
6462  case Intrinsic::x86_sse2_comige_sd:
6463  case Intrinsic::x86_sse2_comineq_sd:
6464  case Intrinsic::x86_sse2_ucomieq_sd:
6465  case Intrinsic::x86_sse2_ucomilt_sd:
6466  case Intrinsic::x86_sse2_ucomile_sd:
6467  case Intrinsic::x86_sse2_ucomigt_sd:
6468  case Intrinsic::x86_sse2_ucomige_sd:
6469  case Intrinsic::x86_sse2_ucomineq_sd: {
6470    unsigned Opc = 0;
6471    ISD::CondCode CC = ISD::SETCC_INVALID;
6472    switch (IntNo) {
6473    default: break;
6474    case Intrinsic::x86_sse_comieq_ss:
6475    case Intrinsic::x86_sse2_comieq_sd:
6476      Opc = X86ISD::COMI;
6477      CC = ISD::SETEQ;
6478      break;
6479    case Intrinsic::x86_sse_comilt_ss:
6480    case Intrinsic::x86_sse2_comilt_sd:
6481      Opc = X86ISD::COMI;
6482      CC = ISD::SETLT;
6483      break;
6484    case Intrinsic::x86_sse_comile_ss:
6485    case Intrinsic::x86_sse2_comile_sd:
6486      Opc = X86ISD::COMI;
6487      CC = ISD::SETLE;
6488      break;
6489    case Intrinsic::x86_sse_comigt_ss:
6490    case Intrinsic::x86_sse2_comigt_sd:
6491      Opc = X86ISD::COMI;
6492      CC = ISD::SETGT;
6493      break;
6494    case Intrinsic::x86_sse_comige_ss:
6495    case Intrinsic::x86_sse2_comige_sd:
6496      Opc = X86ISD::COMI;
6497      CC = ISD::SETGE;
6498      break;
6499    case Intrinsic::x86_sse_comineq_ss:
6500    case Intrinsic::x86_sse2_comineq_sd:
6501      Opc = X86ISD::COMI;
6502      CC = ISD::SETNE;
6503      break;
6504    case Intrinsic::x86_sse_ucomieq_ss:
6505    case Intrinsic::x86_sse2_ucomieq_sd:
6506      Opc = X86ISD::UCOMI;
6507      CC = ISD::SETEQ;
6508      break;
6509    case Intrinsic::x86_sse_ucomilt_ss:
6510    case Intrinsic::x86_sse2_ucomilt_sd:
6511      Opc = X86ISD::UCOMI;
6512      CC = ISD::SETLT;
6513      break;
6514    case Intrinsic::x86_sse_ucomile_ss:
6515    case Intrinsic::x86_sse2_ucomile_sd:
6516      Opc = X86ISD::UCOMI;
6517      CC = ISD::SETLE;
6518      break;
6519    case Intrinsic::x86_sse_ucomigt_ss:
6520    case Intrinsic::x86_sse2_ucomigt_sd:
6521      Opc = X86ISD::UCOMI;
6522      CC = ISD::SETGT;
6523      break;
6524    case Intrinsic::x86_sse_ucomige_ss:
6525    case Intrinsic::x86_sse2_ucomige_sd:
6526      Opc = X86ISD::UCOMI;
6527      CC = ISD::SETGE;
6528      break;
6529    case Intrinsic::x86_sse_ucomineq_ss:
6530    case Intrinsic::x86_sse2_ucomineq_sd:
6531      Opc = X86ISD::UCOMI;
6532      CC = ISD::SETNE;
6533      break;
6534    }
6535
6536    SDValue LHS = Op.getOperand(1);
6537    SDValue RHS = Op.getOperand(2);
6538    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
6539    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
6540    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
6541    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6542                                DAG.getConstant(X86CC, MVT::i8), Cond);
6543    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6544  }
6545  // ptest intrinsics. The intrinsics these come from are designed to return
6546  // an integer value rather than just set flags, so lower them to the ptest
6547  // pattern plus a setcc for the result.
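  //
  // (Illustrative:) e.g. _mm_testz_si128(a, b) returns (a & b) == 0 ? 1 : 0,
  // which becomes a PTEST node setting ZF, a SETCC of COND_E, and a
  // zero-extend to i32.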
6548  case Intrinsic::x86_sse41_ptestz:
6549  case Intrinsic::x86_sse41_ptestc:
6550  case Intrinsic::x86_sse41_ptestnzc:{
6551    unsigned X86CC = 0;
6552    switch (IntNo) {
6553    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
6554    case Intrinsic::x86_sse41_ptestz:
6555      // ZF = 1
6556      X86CC = X86::COND_E;
6557      break;
6558    case Intrinsic::x86_sse41_ptestc:
6559      // CF = 1
6560      X86CC = X86::COND_B;
6561      break;
6562    case Intrinsic::x86_sse41_ptestnzc:
6563      // ZF and CF = 0
6564      X86CC = X86::COND_A;
6565      break;
6566    }
6567
6568    SDValue LHS = Op.getOperand(1);
6569    SDValue RHS = Op.getOperand(2);
6570    SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS);
6571    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
6572    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
6573    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6574  }
6575
6576  // Fix vector shift instructions where the last operand is a non-immediate
6577  // i32 value.
6578  case Intrinsic::x86_sse2_pslli_w:
6579  case Intrinsic::x86_sse2_pslli_d:
6580  case Intrinsic::x86_sse2_pslli_q:
6581  case Intrinsic::x86_sse2_psrli_w:
6582  case Intrinsic::x86_sse2_psrli_d:
6583  case Intrinsic::x86_sse2_psrli_q:
6584  case Intrinsic::x86_sse2_psrai_w:
6585  case Intrinsic::x86_sse2_psrai_d:
6586  case Intrinsic::x86_mmx_pslli_w:
6587  case Intrinsic::x86_mmx_pslli_d:
6588  case Intrinsic::x86_mmx_pslli_q:
6589  case Intrinsic::x86_mmx_psrli_w:
6590  case Intrinsic::x86_mmx_psrli_d:
6591  case Intrinsic::x86_mmx_psrli_q:
6592  case Intrinsic::x86_mmx_psrai_w:
6593  case Intrinsic::x86_mmx_psrai_d: {
6594    SDValue ShAmt = Op.getOperand(2);
6595    if (isa<ConstantSDNode>(ShAmt))
6596      return SDValue();
6597
6598    unsigned NewIntNo = 0;
6599    EVT ShAmtVT = MVT::v4i32;
6600    switch (IntNo) {
6601    case Intrinsic::x86_sse2_pslli_w:
6602      NewIntNo = Intrinsic::x86_sse2_psll_w;
6603      break;
6604    case Intrinsic::x86_sse2_pslli_d:
6605      NewIntNo = Intrinsic::x86_sse2_psll_d;
6606      break;
6607    case Intrinsic::x86_sse2_pslli_q:
6608      NewIntNo = Intrinsic::x86_sse2_psll_q;
6609      break;
6610    case Intrinsic::x86_sse2_psrli_w:
6611      NewIntNo = Intrinsic::x86_sse2_psrl_w;
6612      break;
6613    case Intrinsic::x86_sse2_psrli_d:
6614      NewIntNo = Intrinsic::x86_sse2_psrl_d;
6615      break;
6616    case Intrinsic::x86_sse2_psrli_q:
6617      NewIntNo = Intrinsic::x86_sse2_psrl_q;
6618      break;
6619    case Intrinsic::x86_sse2_psrai_w:
6620      NewIntNo = Intrinsic::x86_sse2_psra_w;
6621      break;
6622    case Intrinsic::x86_sse2_psrai_d:
6623      NewIntNo = Intrinsic::x86_sse2_psra_d;
6624      break;
6625    default: {
6626      ShAmtVT = MVT::v2i32;
6627      switch (IntNo) {
6628      case Intrinsic::x86_mmx_pslli_w:
6629        NewIntNo = Intrinsic::x86_mmx_psll_w;
6630        break;
6631      case Intrinsic::x86_mmx_pslli_d:
6632        NewIntNo = Intrinsic::x86_mmx_psll_d;
6633        break;
6634      case Intrinsic::x86_mmx_pslli_q:
6635        NewIntNo = Intrinsic::x86_mmx_psll_q;
6636        break;
6637      case Intrinsic::x86_mmx_psrli_w:
6638        NewIntNo = Intrinsic::x86_mmx_psrl_w;
6639        break;
6640      case Intrinsic::x86_mmx_psrli_d:
6641        NewIntNo = Intrinsic::x86_mmx_psrl_d;
6642        break;
6643      case Intrinsic::x86_mmx_psrli_q:
6644        NewIntNo = Intrinsic::x86_mmx_psrl_q;
6645        break;
6646      case Intrinsic::x86_mmx_psrai_w:
6647        NewIntNo = Intrinsic::x86_mmx_psra_w;
6648        break;
6649      case Intrinsic::x86_mmx_psrai_d:
6650        NewIntNo = Intrinsic::x86_mmx_psra_d;
6651        break;
6652      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
6653      }
6654      break;
6655    }
6656    }
6657
6658    // The vector shift intrinsics with scalar shift amounts use a 32-bit
6659    // value, but the SSE2/MMX shift instructions read 64 bits. Set the
6660    // upper 32 bits to zero.
6661    SDValue ShOps[4];
6662    ShOps[0] = ShAmt;
6663    ShOps[1] = DAG.getConstant(0, MVT::i32);
6664    if (ShAmtVT == MVT::v4i32) {
6665      ShOps[2] = DAG.getUNDEF(MVT::i32);
6666      ShOps[3] = DAG.getUNDEF(MVT::i32);
6667      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
6668    } else {
6669      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
6670    }
6671
6672    EVT VT = Op.getValueType();
6673    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
6674    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6675                       DAG.getConstant(NewIntNo, MVT::i32),
6676                       Op.getOperand(1), ShAmt);
6677  }
6678  }
6679}
6680
6681SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
6682  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6683  DebugLoc dl = Op.getDebugLoc();
6684
6685  if (Depth > 0) {
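    // Explanatory note (assumes the standard x86 frame-pointer layout): each
    // frame saves the caller's frame pointer at [FrameAddr] and the return
    // address one pointer-size above it, so after walking Depth frames the
    // wanted return address is loaded from [FrameAddr + PointerSize].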
6686    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6687    SDValue Offset =
6688      DAG.getConstant(TD->getPointerSize(),
6689                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
6690    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6691                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
6692                                   FrameAddr, Offset),
6693                       NULL, 0);
6694  }
6695
6696  // Just load the return address.
6697  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
6698  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6699                     RetAddrFI, NULL, 0);
6700}
6701
6702SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
6703  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
6704  MFI->setFrameAddressIsTaken(true);
6705  EVT VT = Op.getValueType();
6706  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
6707  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6708  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
6709  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6710  while (Depth--)
6711    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
6712  return FrameAddr;
6713}
6714
6715SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
6716                                                     SelectionDAG &DAG) {
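  // Explanatory note (a sketch of the assumed frame layout): the saved frame
  // pointer and the return address occupy the two pointer-size slots between
  // the frame pointer and the incoming arguments, hence 2 * PointerSize.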
6717  return DAG.getIntPtrConstant(2*TD->getPointerSize());
6718}
6719
6720SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
6721{
6722  MachineFunction &MF = DAG.getMachineFunction();
6723  SDValue Chain     = Op.getOperand(0);
6724  SDValue Offset    = Op.getOperand(1);
6725  SDValue Handler   = Op.getOperand(2);
6726  DebugLoc dl       = Op.getDebugLoc();
6727
6728  SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
6729                                  getPointerTy());
6730  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
6731
6732  SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
6733                                  DAG.getIntPtrConstant(-TD->getPointerSize()));
6734  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
6735  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0);
6736  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
6737  MF.getRegInfo().addLiveOut(StoreAddrReg);
6738
6739  return DAG.getNode(X86ISD::EH_RETURN, dl,
6740                     MVT::Other,
6741                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
6742}
6743
6744SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
6745                                             SelectionDAG &DAG) {
6746  SDValue Root = Op.getOperand(0);
6747  SDValue Trmp = Op.getOperand(1); // trampoline
6748  SDValue FPtr = Op.getOperand(2); // nested function
6749  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
6750  DebugLoc dl  = Op.getDebugLoc();
6751
6752  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
6753
6754  const X86InstrInfo *TII =
6755    ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
6756
6757  if (Subtarget->is64Bit()) {
6758    SDValue OutChains[6];
6759
6760    // Large code-model.
6761
6762    const unsigned char JMP64r  = TII->getBaseOpcodeFor(X86::JMP64r);
6763    const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri);
6764
6765    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
6766    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
6767
6768    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
6769
6770    // Load the pointer to the nested function into R11.
6771    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
6772    SDValue Addr = Trmp;
6773    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
6774                                Addr, TrmpAddr, 0);
6775
6776    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6777                       DAG.getConstant(2, MVT::i64));
6778    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2);
6779
6780    // Load the 'nest' parameter value into R10.
6781    // R10 is specified in X86CallingConv.td
6782    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
6783    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6784                       DAG.getConstant(10, MVT::i64));
6785    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
6786                                Addr, TrmpAddr, 10);
6787
6788    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6789                       DAG.getConstant(12, MVT::i64));
6790    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2);
6791
6792    // Jump to the nested function.
6793    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
6794    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6795                       DAG.getConstant(20, MVT::i64));
6796    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
6797                                Addr, TrmpAddr, 20);
6798
6799    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
6800    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6801                       DAG.getConstant(22, MVT::i64));
6802    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
6803                                TrmpAddr, 22);
6804
6805    SDValue Ops[] =
6806      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
6807    return DAG.getMergeValues(Ops, 2, dl);
6808  } else {
6809    const Function *Func =
6810      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
6811    CallingConv::ID CC = Func->getCallingConv();
6812    unsigned NestReg;
6813
6814    switch (CC) {
6815    default:
6816      llvm_unreachable("Unsupported calling convention");
6817    case CallingConv::C:
6818    case CallingConv::X86_StdCall: {
6819      // Pass 'nest' parameter in ECX.
6820      // Must be kept in sync with X86CallingConv.td
6821      NestReg = X86::ECX;
6822
6823      // Check that ECX wasn't needed by an 'inreg' parameter.
6824      const FunctionType *FTy = Func->getFunctionType();
6825      const AttrListPtr &Attrs = Func->getAttributes();
6826
6827      if (!Attrs.isEmpty() && !Func->isVarArg()) {
6828        unsigned InRegCount = 0;
6829        unsigned Idx = 1;
6830
6831        for (FunctionType::param_iterator I = FTy->param_begin(),
6832             E = FTy->param_end(); I != E; ++I, ++Idx)
6833          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
6834            // FIXME: should only count parameters that are lowered to integers.
6835            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
6836
6837        if (InRegCount > 2) {
6838          llvm_report_error("Nest register in use - reduce number of inreg parameters!");
6839        }
6840      }
6841      break;
6842    }
6843    case CallingConv::X86_FastCall:
6844    case CallingConv::Fast:
6845      // Pass 'nest' parameter in EAX.
6846      // Must be kept in sync with X86CallingConv.td
6847      NestReg = X86::EAX;
6848      break;
6849    }
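    // Rough sketch of the 10-byte 32-bit trampoline emitted below (offsets
    // match the constants used in the stores; byte values are an assumption
    // based on the opcode table lookups):
    //   0: movl $Nest, %NestReg    (1-byte opcode, imm32 at offset 1)
    //   5: jmp  FPtr               (rel32 jump; Disp stored at offset 6)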
6850
6851    SDValue OutChains[4];
6852    SDValue Addr, Disp;
6853
6854    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6855                       DAG.getConstant(10, MVT::i32));
6856    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
6857
6858    const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri);
6859    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
6860    OutChains[0] = DAG.getStore(Root, dl,
6861                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
6862                                Trmp, TrmpAddr, 0);
6863
6864    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6865                       DAG.getConstant(1, MVT::i32));
6866    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1);
6867
6868    const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP);
6869    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6870                       DAG.getConstant(5, MVT::i32));
6871    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
6872                                TrmpAddr, 5, false, 1);
6873
6874    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6875                       DAG.getConstant(6, MVT::i32));
6876    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1);
6877
6878    SDValue Ops[] =
6879      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
6880    return DAG.getMergeValues(Ops, 2, dl);
6881  }
6882}
6883
6884SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
6885  /*
6886   The rounding mode is in bits 11:10 of FPCW (the x87 FP control word), and has the following
6887   settings:
6888     00 Round to nearest
6889     01 Round to -inf
6890     10 Round to +inf
6891     11 Round to 0
6892
6893  FLT_ROUNDS, on the other hand, expects the following:
6894    -1 Undefined
6895     0 Round to 0
6896     1 Round to nearest
6897     2 Round to +inf
6898     3 Round to -inf
6899
6900  To perform the conversion, we do:
6901    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
6902  */
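  // Worked check of the formula above (assuming the FPCW encoding listed):
  //   RC=00 -> (0|0)+1 = 1 (nearest),   RC=01 -> (0|2)+1 = 3 (-inf),
  //   RC=10 -> (1|0)+1 = 2 (+inf),      RC=11 -> ((1|2)+1)&3 = 0 (to zero).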
6903
6904  MachineFunction &MF = DAG.getMachineFunction();
6905  const TargetMachine &TM = MF.getTarget();
6906  const TargetFrameInfo &TFI = *TM.getFrameInfo();
6907  unsigned StackAlignment = TFI.getStackAlignment();
6908  EVT VT = Op.getValueType();
6909  DebugLoc dl = Op.getDebugLoc();
6910
6911  // Save FP Control Word to stack slot
6912  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
6913  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
6914
6915  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
6916                              DAG.getEntryNode(), StackSlot);
6917
6918  // Load FP Control Word from stack slot
6919  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0);
6920
6921  // Transform as necessary
6922  SDValue CWD1 =
6923    DAG.getNode(ISD::SRL, dl, MVT::i16,
6924                DAG.getNode(ISD::AND, dl, MVT::i16,
6925                            CWD, DAG.getConstant(0x800, MVT::i16)),
6926                DAG.getConstant(11, MVT::i8));
6927  SDValue CWD2 =
6928    DAG.getNode(ISD::SRL, dl, MVT::i16,
6929                DAG.getNode(ISD::AND, dl, MVT::i16,
6930                            CWD, DAG.getConstant(0x400, MVT::i16)),
6931                DAG.getConstant(9, MVT::i8));
6932
6933  SDValue RetVal =
6934    DAG.getNode(ISD::AND, dl, MVT::i16,
6935                DAG.getNode(ISD::ADD, dl, MVT::i16,
6936                            DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
6937                            DAG.getConstant(1, MVT::i16)),
6938                DAG.getConstant(3, MVT::i16));
6939
6940
6941  return DAG.getNode((VT.getSizeInBits() < 16 ?
6942                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
6943}
6944
6945SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
6946  EVT VT = Op.getValueType();
6947  EVT OpVT = VT;
6948  unsigned NumBits = VT.getSizeInBits();
6949  DebugLoc dl = Op.getDebugLoc();
6950
6951  Op = Op.getOperand(0);
6952  if (VT == MVT::i8) {
6953    // Zero extend to i32 since there is not an i8 bsr.
6954    OpVT = MVT::i32;
6955    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
6956  }
6957
6958  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
6959  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
6960  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
6961
6962  // If src is zero (i.e. bsr sets ZF), returns NumBits.
6963  SDValue Ops[] = {
6964    Op,
6965    DAG.getConstant(NumBits+NumBits-1, OpVT),
6966    DAG.getConstant(X86::COND_E, MVT::i8),
6967    Op.getValue(1)
6968  };
6969  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
6970
6971  // Finally xor with NumBits-1.
6972  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
6973
6974  if (VT == MVT::i8)
6975    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
6976  return Op;
6977}
6978
6979SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
6980  EVT VT = Op.getValueType();
6981  EVT OpVT = VT;
6982  unsigned NumBits = VT.getSizeInBits();
6983  DebugLoc dl = Op.getDebugLoc();
6984
6985  Op = Op.getOperand(0);
6986  if (VT == MVT::i8) {
6987    OpVT = MVT::i32;
6988    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
6989  }
6990
6991  // Issue a bsf (scan bits forward) which also sets EFLAGS.
6992  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
6993  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
6994
6995  // If src is zero (i.e. bsf sets ZF), returns NumBits.
6996  SDValue Ops[] = {
6997    Op,
6998    DAG.getConstant(NumBits, OpVT),
6999    DAG.getConstant(X86::COND_E, MVT::i8),
7000    Op.getValue(1)
7001  };
7002  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
7003
7004  if (VT == MVT::i8)
7005    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
7006  return Op;
7007}
7008
7009SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
7010  EVT VT = Op.getValueType();
7011  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
7012  DebugLoc dl = Op.getDebugLoc();
7013
7014  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
7015  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
7016  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
7017  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
7018  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
7019  //
7020  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
7021  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
7022  //  return AloBlo + AloBhi + AhiBlo;
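  //
  //  Why this works (a sketch): writing a = 2^32*Ahi + Alo and
  //  b = 2^32*Bhi + Blo, the product modulo 2^64 is
  //    a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo)
  //  (the Ahi*Bhi term overflows out of 64 bits), and pmuludq supplies the
  //  32x32->64 partial products.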
7023
7024  SDValue A = Op.getOperand(0);
7025  SDValue B = Op.getOperand(1);
7026
7027  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7028                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7029                       A, DAG.getConstant(32, MVT::i32));
7030  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7031                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7032                       B, DAG.getConstant(32, MVT::i32));
7033  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7034                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7035                       A, B);
7036  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7037                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7038                       A, Bhi);
7039  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7040                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7041                       Ahi, B);
7042  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7043                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7044                       AloBhi, DAG.getConstant(32, MVT::i32));
7045  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7046                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7047                       AhiBlo, DAG.getConstant(32, MVT::i32));
7048  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
7049  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
7050  return Res;
7051}
7052
7053
7054SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
7055  // Lower the "add/sub/mul with overflow" node into a regular instruction plus
7056  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
7057  // looks for this combo and may remove the "setcc" instruction if the "setcc"
7058  // has only one use.
7059  SDNode *N = Op.getNode();
7060  SDValue LHS = N->getOperand(0);
7061  SDValue RHS = N->getOperand(1);
7062  unsigned BaseOp = 0;
7063  unsigned Cond = 0;
7064  DebugLoc dl = Op.getDebugLoc();
7065
7066  switch (Op.getOpcode()) {
7067  default: llvm_unreachable("Unknown ovf instruction!");
7068  case ISD::SADDO:
7069    // An add of one will be selected as an INC. Note that INC doesn't
7070    // set CF, so we can't do this for UADDO.
7071    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
7072      if (C->getAPIntValue() == 1) {
7073        BaseOp = X86ISD::INC;
7074        Cond = X86::COND_O;
7075        break;
7076      }
7077    BaseOp = X86ISD::ADD;
7078    Cond = X86::COND_O;
7079    break;
7080  case ISD::UADDO:
7081    BaseOp = X86ISD::ADD;
7082    Cond = X86::COND_B;
7083    break;
7084  case ISD::SSUBO:
7085    // A subtract of one will be selected as a DEC. Note that DEC doesn't
7086    // set CF, so we can't do this for USUBO.
7087    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
7088      if (C->getAPIntValue() == 1) {
7089        BaseOp = X86ISD::DEC;
7090        Cond = X86::COND_O;
7091        break;
7092      }
7093    BaseOp = X86ISD::SUB;
7094    Cond = X86::COND_O;
7095    break;
7096  case ISD::USUBO:
7097    BaseOp = X86ISD::SUB;
7098    Cond = X86::COND_B;
7099    break;
7100  case ISD::SMULO:
7101    BaseOp = X86ISD::SMUL;
7102    Cond = X86::COND_O;
7103    break;
7104  case ISD::UMULO:
7105    BaseOp = X86ISD::UMUL;
7106    Cond = X86::COND_B;
7107    break;
7108  }
7109
7110  // Also sets EFLAGS.
7111  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
7112  SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
7113
7114  SDValue SetCC =
7115    DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
7116                DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
7117
7118  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
7119  return Sum;
7120}
7121
7122SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
7123  EVT T = Op.getValueType();
7124  DebugLoc dl = Op.getDebugLoc();
7125  unsigned Reg = 0;
7126  unsigned size = 0;
7127  switch(T.getSimpleVT().SimpleTy) {
7128  default:
7129    assert(false && "Invalid value type!");
7130  case MVT::i8:  Reg = X86::AL;  size = 1; break;
7131  case MVT::i16: Reg = X86::AX;  size = 2; break;
7132  case MVT::i32: Reg = X86::EAX; size = 4; break;
7133  case MVT::i64:
7134    assert(Subtarget->is64Bit() && "Node not type legal!");
7135    Reg = X86::RAX; size = 8;
7136    break;
7137  }
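  // CMPXCHG compares memory against the accumulator (AL/AX/EAX/RAX), so the
  // expected value (operand 2) is copied into that register first, and the
  // previous memory contents are read back out of it afterwards.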
7138  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
7139                                    Op.getOperand(2), SDValue());
7140  SDValue Ops[] = { cpIn.getValue(0),
7141                    Op.getOperand(1),
7142                    Op.getOperand(3),
7143                    DAG.getTargetConstant(size, MVT::i8),
7144                    cpIn.getValue(1) };
7145  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7146  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
7147  SDValue cpOut =
7148    DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
7149  return cpOut;
7150}
7151
7152SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
7153                                                 SelectionDAG &DAG) {
7154  assert(Subtarget->is64Bit() && "Result not type legalized?");
7155  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7156  SDValue TheChain = Op.getOperand(0);
7157  DebugLoc dl = Op.getDebugLoc();
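  // RDTSC delivers the 64-bit timestamp split across EDX:EAX (read here as
  // RAX/RDX); recombine it as (RDX << 32) | RAX.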
7158  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7159  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
7160  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
7161                                   rax.getValue(2));
7162  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
7163                            DAG.getConstant(32, MVT::i8));
7164  SDValue Ops[] = {
7165    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
7166    rdx.getValue(1)
7167  };
7168  return DAG.getMergeValues(Ops, 2, dl);
7169}
7170
7171SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
7172  SDNode *Node = Op.getNode();
7173  DebugLoc dl = Node->getDebugLoc();
7174  EVT T = Node->getValueType(0);
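  // Explanatory note (an assumption about the motivation): x86 has no
  // fetch-and-subtract form of XADD, so the atomic subtract is lowered as an
  // atomic add of the negated operand.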
7175  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
7176                              DAG.getConstant(0, T), Node->getOperand(2));
7177  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
7178                       cast<AtomicSDNode>(Node)->getMemoryVT(),
7179                       Node->getOperand(0),
7180                       Node->getOperand(1), negOp,
7181                       cast<AtomicSDNode>(Node)->getSrcValue(),
7182                       cast<AtomicSDNode>(Node)->getAlignment());
7183}
7184
7185/// LowerOperation - Provide custom lowering hooks for some operations.
7186///
7187SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
7188  switch (Op.getOpcode()) {
7189  default: llvm_unreachable("Should not custom lower this!");
7190  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
7191  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
7192  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
7193  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
7194  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7195  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
7196  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
7197  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
7198  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
7199  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
7200  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
7201  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
7202  case ISD::SHL_PARTS:
7203  case ISD::SRA_PARTS:
7204  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
7205  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
7206  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
7207  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
7208  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
7209  case ISD::FABS:               return LowerFABS(Op, DAG);
7210  case ISD::FNEG:               return LowerFNEG(Op, DAG);
7211  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
7212  case ISD::SETCC:              return LowerSETCC(Op, DAG);
7213  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
7214  case ISD::SELECT:             return LowerSELECT(Op, DAG);
7215  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
7216  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
7217  case ISD::VASTART:            return LowerVASTART(Op, DAG);
7218  case ISD::VAARG:              return LowerVAARG(Op, DAG);
7219  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
7220  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7221  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
7222  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
7223  case ISD::FRAME_TO_ARGS_OFFSET:
7224                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
7225  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
7226  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
7227  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
7228  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
7229  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
7230  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
7231  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
7232  case ISD::SADDO:
7233  case ISD::UADDO:
7234  case ISD::SSUBO:
7235  case ISD::USUBO:
7236  case ISD::SMULO:
7237  case ISD::UMULO:              return LowerXALUO(Op, DAG);
7238  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
7239  }
7240}
7241
7242void X86TargetLowering::
7243ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
7244                        SelectionDAG &DAG, unsigned NewOp) {
7245  EVT T = Node->getValueType(0);
7246  DebugLoc dl = Node->getDebugLoc();
7247  assert (T == MVT::i64 && "Only know how to expand i64 atomics");
7248
7249  SDValue Chain = Node->getOperand(0);
7250  SDValue In1 = Node->getOperand(1);
7251  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7252                             Node->getOperand(2), DAG.getIntPtrConstant(0));
7253  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7254                             Node->getOperand(2), DAG.getIntPtrConstant(1));
7255  SDValue Ops[] = { Chain, In1, In2L, In2H };
7256  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
7257  SDValue Result =
7258    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
7259                            cast<MemSDNode>(Node)->getMemOperand());
7260  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
7261  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
7262  Results.push_back(Result.getValue(2));
7263}
7264
7265/// ReplaceNodeResults - Replace a node with an illegal result type
7266/// with a new node built out of custom code.
7267void X86TargetLowering::ReplaceNodeResults(SDNode *N,
7268                                           SmallVectorImpl<SDValue>&Results,
7269                                           SelectionDAG &DAG) {
7270  DebugLoc dl = N->getDebugLoc();
7271  switch (N->getOpcode()) {
7272  default:
7273    assert(false && "Do not know how to custom type legalize this operation!");
7274    return;
7275  case ISD::FP_TO_SINT: {
7276    std::pair<SDValue,SDValue> Vals =
7277        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
7278    SDValue FIST = Vals.first, StackSlot = Vals.second;
7279    if (FIST.getNode() != 0) {
7280      EVT VT = N->getValueType(0);
7281      // Return a load from the stack slot.
7282      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0));
7283    }
7284    return;
7285  }
7286  case ISD::READCYCLECOUNTER: {
7287    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7288    SDValue TheChain = N->getOperand(0);
7289    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7290    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
7291                                     rd.getValue(1));
7292    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
7293                                     eax.getValue(2));
7294    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
7295    SDValue Ops[] = { eax, edx };
7296    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
7297    Results.push_back(edx.getValue(1));
7298    return;
7299  }
7300  case ISD::SDIV:
7301  case ISD::UDIV:
7302  case ISD::SREM:
7303  case ISD::UREM: {
7304    EVT WidenVT = getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
7305    Results.push_back(DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()));
7306    return;
7307  }
7308  case ISD::ATOMIC_CMP_SWAP: {
7309    EVT T = N->getValueType(0);
7310    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
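    // CMPXCHG8B compares the 64-bit memory operand against EDX:EAX and, on a
    // match, stores ECX:EBX; either way EDX:EAX ends up holding the old
    // memory value, which is rebuilt into an i64 pair below.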
7311    SDValue cpInL, cpInH;
7312    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
7313                        DAG.getConstant(0, MVT::i32));
7314    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
7315                        DAG.getConstant(1, MVT::i32));
7316    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
7317    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
7318                             cpInL.getValue(1));
7319    SDValue swapInL, swapInH;
7320    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
7321                          DAG.getConstant(0, MVT::i32));
7322    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
7323                          DAG.getConstant(1, MVT::i32));
7324    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
7325                               cpInH.getValue(1));
7326    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
7327                               swapInL.getValue(1));
7328    SDValue Ops[] = { swapInH.getValue(0),
7329                      N->getOperand(1),
7330                      swapInH.getValue(1) };
7331    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7332    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
7333    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
7334                                        MVT::i32, Result.getValue(1));
7335    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
7336                                        MVT::i32, cpOutL.getValue(2));
7337    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
7338    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
7339    Results.push_back(cpOutH.getValue(1));
7340    return;
7341  }
7342  case ISD::ATOMIC_LOAD_ADD:
7343    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
7344    return;
7345  case ISD::ATOMIC_LOAD_AND:
7346    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
7347    return;
7348  case ISD::ATOMIC_LOAD_NAND:
7349    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
7350    return;
7351  case ISD::ATOMIC_LOAD_OR:
7352    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
7353    return;
7354  case ISD::ATOMIC_LOAD_SUB:
7355    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
7356    return;
7357  case ISD::ATOMIC_LOAD_XOR:
7358    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
7359    return;
7360  case ISD::ATOMIC_SWAP:
7361    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
7362    return;
7363  }
7364}
7365
7366const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
7367  switch (Opcode) {
7368  default: return NULL;
7369  case X86ISD::BSF:                return "X86ISD::BSF";
7370  case X86ISD::BSR:                return "X86ISD::BSR";
7371  case X86ISD::SHLD:               return "X86ISD::SHLD";
7372  case X86ISD::SHRD:               return "X86ISD::SHRD";
7373  case X86ISD::FAND:               return "X86ISD::FAND";
7374  case X86ISD::FOR:                return "X86ISD::FOR";
7375  case X86ISD::FXOR:               return "X86ISD::FXOR";
7376  case X86ISD::FSRL:               return "X86ISD::FSRL";
7377  case X86ISD::FILD:               return "X86ISD::FILD";
7378  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
7379  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
7380  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
7381  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
7382  case X86ISD::FLD:                return "X86ISD::FLD";
7383  case X86ISD::FST:                return "X86ISD::FST";
7384  case X86ISD::CALL:               return "X86ISD::CALL";
7385  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
7386  case X86ISD::BT:                 return "X86ISD::BT";
7387  case X86ISD::CMP:                return "X86ISD::CMP";
7388  case X86ISD::COMI:               return "X86ISD::COMI";
7389  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
7390  case X86ISD::SETCC:              return "X86ISD::SETCC";
7391  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
7392  case X86ISD::CMOV:               return "X86ISD::CMOV";
7393  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
7394  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
7395  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
7396  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
7397  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
7398  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
7399  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
7400  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
7401  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
7402  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
7403  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
7404  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
7405  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
7406  case X86ISD::FMAX:               return "X86ISD::FMAX";
7407  case X86ISD::FMIN:               return "X86ISD::FMIN";
7408  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
7409  case X86ISD::FRCP:               return "X86ISD::FRCP";
7410  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
7411  case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
7412  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
7413  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
7414  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
7415  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
7416  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
7417  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
7418  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
7419  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
7420  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
7421  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
7422  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
7423  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
7424  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
7425  case X86ISD::VSHL:               return "X86ISD::VSHL";
7426  case X86ISD::VSRL:               return "X86ISD::VSRL";
7427  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
7428  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
7429  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
7430  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
7431  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
7432  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
7433  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
7434  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
7435  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
7436  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
7437  case X86ISD::ADD:                return "X86ISD::ADD";
7438  case X86ISD::SUB:                return "X86ISD::SUB";
7439  case X86ISD::SMUL:               return "X86ISD::SMUL";
7440  case X86ISD::UMUL:               return "X86ISD::UMUL";
7441  case X86ISD::INC:                return "X86ISD::INC";
7442  case X86ISD::DEC:                return "X86ISD::DEC";
7443  case X86ISD::OR:                 return "X86ISD::OR";
7444  case X86ISD::XOR:                return "X86ISD::XOR";
7445  case X86ISD::AND:                return "X86ISD::AND";
7446  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
7447  case X86ISD::PTEST:              return "X86ISD::PTEST";
7448  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
7449  }
7450}
7451
7452// isLegalAddressingMode - Return true if the addressing mode represented
7453// by AM is legal for this target, for a load/store of the specified type.
7454bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
7455                                              const Type *Ty) const {
7456  // X86 supports extremely general addressing modes.
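  // Roughly: [BaseReg + Scale*IndexReg + Disp], with Scale drawn from
  // {1, 2, 4, 8}.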
7457  CodeModel::Model M = getTargetMachine().getCodeModel();
7458
7459  // X86 allows a sign-extended 32-bit immediate field as a displacement.
7460  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
7461    return false;
7462
7463  if (AM.BaseGV) {
7464    unsigned GVFlags =
7465      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
7466
7467    // If a reference to this global requires an extra load, we can't fold it.
7468    if (isGlobalStubReference(GVFlags))
7469      return false;
7470
7471    // If BaseGV requires a register for the PIC base, we cannot also have a
7472    // BaseReg specified.
7473    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
7474      return false;
7475
7476    // If lower 4G is not available, then we must use rip-relative addressing.
7477    if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
7478      return false;
7479  }
7480
7481  switch (AM.Scale) {
7482  case 0:
7483  case 1:
7484  case 2:
7485  case 4:
7486  case 8:
7487    // These scales always work.
7488    break;
7489  case 3:
7490  case 5:
7491  case 9:
7492    // These scales are formed with basereg+scalereg.  Only accept if there is
7493    // no basereg yet.
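    // (e.g. a scale of 5 is matched as Index + 4*Index, reusing the same
    // register as both base and index.)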
7494    if (AM.HasBaseReg)
7495      return false;
7496    break;
7497  default:  // Other stuff never works.
7498    return false;
7499  }
7500
7501  return true;
7502}
7503
7504
7505bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
7506  if (!Ty1->isInteger() || !Ty2->isInteger())
7507    return false;
7508  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
7509  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
7510  if (NumBits1 <= NumBits2)
7511    return false;
7512  return Subtarget->is64Bit() || NumBits1 < 64;
7513}
7514
7515bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
7516  if (!VT1.isInteger() || !VT2.isInteger())
7517    return false;
7518  unsigned NumBits1 = VT1.getSizeInBits();
7519  unsigned NumBits2 = VT2.getSizeInBits();
7520  if (NumBits1 <= NumBits2)
7521    return false;
7522  return Subtarget->is64Bit() || NumBits1 < 64;
7523}
7524
7525bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
7526  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
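  // (Any write to a 32-bit GPR clears bits 63:32 of the containing 64-bit
  // register, so the i32 -> i64 zero extension needs no extra instruction.)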
7527  return Ty1 == Type::getInt32Ty(Ty1->getContext()) &&
7528         Ty2 == Type::getInt64Ty(Ty1->getContext()) && Subtarget->is64Bit();
7529}
7530
7531bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
7532  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7533  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
7534}
7535
7536bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
7537  // i16 instructions are longer (0x66 prefix) and potentially slower.
7538  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
7539}
7540
7541/// isShuffleMaskLegal - Targets can use this to indicate that they only
7542/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
7543/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
7544/// are assumed to be legal.
7545bool
7546X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
7547                                      EVT VT) const {
7548  // Only do shuffles on 128-bit vector types for now.
7549  if (VT.getSizeInBits() == 64)
7550    return false;
7551
7552  // FIXME: pshufb, blends, shifts.
7553  return (VT.getVectorNumElements() == 2 ||
7554          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7555          isMOVLMask(M, VT) ||
7556          isSHUFPMask(M, VT) ||
7557          isPSHUFDMask(M, VT) ||
7558          isPSHUFHWMask(M, VT) ||
7559          isPSHUFLWMask(M, VT) ||
7560          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
7561          isUNPCKLMask(M, VT) ||
7562          isUNPCKHMask(M, VT) ||
7563          isUNPCKL_v_undef_Mask(M, VT) ||
7564          isUNPCKH_v_undef_Mask(M, VT));
7565}
7566
7567bool
7568X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
7569                                          EVT VT) const {
7570  unsigned NumElts = VT.getVectorNumElements();
7571  // FIXME: This collection of masks seems suspect.
7572  if (NumElts == 2)
7573    return true;
7574  if (NumElts == 4 && VT.getSizeInBits() == 128) {
7575    return (isMOVLMask(Mask, VT)  ||
7576            isCommutedMOVLMask(Mask, VT, true) ||
7577            isSHUFPMask(Mask, VT) ||
7578            isCommutedSHUFPMask(Mask, VT));
7579  }
7580  return false;
7581}
7582
7583//===----------------------------------------------------------------------===//
7584//                           X86 Scheduler Hooks
7585//===----------------------------------------------------------------------===//
7586
7587// private utility function
7588MachineBasicBlock *
7589X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
7590                                                       MachineBasicBlock *MBB,
7591                                                       unsigned regOpc,
7592                                                       unsigned immOpc,
7593                                                       unsigned LoadOpc,
7594                                                       unsigned CXchgOpc,
7595                                                       unsigned copyOpc,
7596                                                       unsigned notOpc,
7597                                                       unsigned EAXreg,
7598                                                       TargetRegisterClass *RC,
7599                                                       bool invSrc) const {
7600  // For the atomic bitwise operator, we generate
7601  //   thisMBB:
7602  //   newMBB:
7603  //     ld  t1 = [bitinstr.addr]
7604  //     op  t2 = t1, [bitinstr.val]
7605  //     mov EAX = t1
7606  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
7607  //     bz  newMBB
7608  //     fallthrough -->nextMBB
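  // i.e. the usual optimistic load / compute / LOCK CMPXCHG retry loop, with
  // the expected value kept in EAX (or the width-appropriate accumulator).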
7609  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7610  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7611  MachineFunction::iterator MBBIter = MBB;
7612  ++MBBIter;
7613
7614  /// First build the CFG
7615  MachineFunction *F = MBB->getParent();
7616  MachineBasicBlock *thisMBB = MBB;
7617  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7618  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7619  F->insert(MBBIter, newMBB);
7620  F->insert(MBBIter, nextMBB);
7621
7622  // Move all successors of thisMBB to nextMBB
7623  nextMBB->transferSuccessors(thisMBB);
7624
7625  // Update thisMBB to fall through to newMBB
7626  thisMBB->addSuccessor(newMBB);
7627
7628  // newMBB jumps to itself and falls through to nextMBB
7629  newMBB->addSuccessor(nextMBB);
7630  newMBB->addSuccessor(newMBB);
7631
7632  // Insert instructions into newMBB based on incoming instruction
7633  assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7634         "unexpected number of operands");
7635  DebugLoc dl = bInstr->getDebugLoc();
7636  MachineOperand& destOper = bInstr->getOperand(0);
7637  MachineOperand* argOpers[2 + X86AddrNumOperands];
7638  int numArgs = bInstr->getNumOperands() - 1;
7639  for (int i=0; i < numArgs; ++i)
7640    argOpers[i] = &bInstr->getOperand(i+1);
7641
7642  // x86 address has 4 operands: base, index, scale, and displacement
7643  int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
7644  int valArgIndx = lastAddrIndx + 1;
7645
7646  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7647  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
7648  for (int i=0; i <= lastAddrIndx; ++i)
7649    (*MIB).addOperand(*argOpers[i]);
7650
7651  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
7652  if (invSrc) {
7653    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
7654  }
7655  else
7656    tt = t1;
7657
7658  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7659  assert((argOpers[valArgIndx]->isReg() ||
7660          argOpers[valArgIndx]->isImm()) &&
7661         "invalid operand");
7662  if (argOpers[valArgIndx]->isReg())
7663    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
7664  else
7665    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
7666  MIB.addReg(tt);
7667  (*MIB).addOperand(*argOpers[valArgIndx]);
7668
7669  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
7670  MIB.addReg(t1);
7671
7672  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
7673  for (int i=0; i <= lastAddrIndx; ++i)
7674    (*MIB).addOperand(*argOpers[i]);
7675  MIB.addReg(t2);
7676  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7677  (*MIB).setMemRefs(bInstr->memoperands_begin(),
7678                    bInstr->memoperands_end());
7679
7680  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
7681  MIB.addReg(EAXreg);
7682
7683  // insert branch
7684  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7685
7686  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
7687  return nextMBB;
7688}
7689
7690// private utility function:  64 bit atomics on 32 bit host.
7691MachineBasicBlock *
7692X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
7693                                                       MachineBasicBlock *MBB,
7694                                                       unsigned regOpcL,
7695                                                       unsigned regOpcH,
7696                                                       unsigned immOpcL,
7697                                                       unsigned immOpcH,
7698                                                       bool invSrc) const {
7699  // For the atomic bitwise operator, we generate
7700  //   thisMBB (instructions are in pairs, except cmpxchg8b)
7701  //     ld t1,t2 = [bitinstr.addr]
7702  //   newMBB:
7703  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
7704  //     op  t5, t6 <- out1, out2, [bitinstr.val]
7705  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
7706  //     mov ECX, EBX <- t5, t6
7707  //     mov EAX, EDX <- t1, t2
7708  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
7709  //     mov t3, t4 <- EAX, EDX
7710  //     bz  newMBB
7711  //     result in out1, out2
7712  //     fallthrough -->nextMBB
7713
7714  const TargetRegisterClass *RC = X86::GR32RegisterClass;
7715  const unsigned LoadOpc = X86::MOV32rm;
7716  const unsigned copyOpc = X86::MOV32rr;
7717  const unsigned NotOpc = X86::NOT32r;
7718  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7719  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7720  MachineFunction::iterator MBBIter = MBB;
7721  ++MBBIter;
7722
7723  /// First build the CFG
7724  MachineFunction *F = MBB->getParent();
7725  MachineBasicBlock *thisMBB = MBB;
7726  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7727  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7728  F->insert(MBBIter, newMBB);
7729  F->insert(MBBIter, nextMBB);
7730
7731  // Move all successors of thisMBB to nextMBB
7732  nextMBB->transferSuccessors(thisMBB);
7733
7734  // Update thisMBB to fall through to newMBB
7735  thisMBB->addSuccessor(newMBB);
7736
7737  // newMBB jumps to itself and falls through to nextMBB
7738  newMBB->addSuccessor(nextMBB);
7739  newMBB->addSuccessor(newMBB);
7740
7741  DebugLoc dl = bInstr->getDebugLoc();
7742  // Insert instructions into newMBB based on incoming instruction
7743  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
7744  assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
7745         "unexpected number of operands");
7746  MachineOperand& dest1Oper = bInstr->getOperand(0);
7747  MachineOperand& dest2Oper = bInstr->getOperand(1);
7748  MachineOperand* argOpers[2 + X86AddrNumOperands];
7749  for (int i=0; i < 2 + X86AddrNumOperands; ++i)
7750    argOpers[i] = &bInstr->getOperand(i+2);
7751
7752  // x86 address has 4 operands: base, index, scale, and displacement
7753  int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
7754
7755  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7756  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
7757  for (int i=0; i <= lastAddrIndx; ++i)
7758    (*MIB).addOperand(*argOpers[i]);
7759  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7760  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
7761  // add 4 to displacement.
7762  for (int i=0; i <= lastAddrIndx-2; ++i)
7763    (*MIB).addOperand(*argOpers[i]);
7764  MachineOperand newOp3 = *(argOpers[3]);
7765  if (newOp3.isImm())
7766    newOp3.setImm(newOp3.getImm()+4);
7767  else
7768    newOp3.setOffset(newOp3.getOffset()+4);
7769  (*MIB).addOperand(newOp3);
7770  (*MIB).addOperand(*argOpers[lastAddrIndx]);
7771
7772  // t3/4 are defined later, at the bottom of the loop
7773  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
7774  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
7775  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
7776    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
7777  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
7778    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
7779
7780  unsigned tt1 = F->getRegInfo().createVirtualRegister(RC);
7781  unsigned tt2 = F->getRegInfo().createVirtualRegister(RC);
7782  if (invSrc) {
7783    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1);
7784    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2);
7785  } else {
7786    tt1 = t1;
7787    tt2 = t2;
7788  }
7789
7790  int valArgIndx = lastAddrIndx + 1;
7791  assert((argOpers[valArgIndx]->isReg() ||
7792          argOpers[valArgIndx]->isImm()) &&
7793         "invalid operand");
7794  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
7795  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
7796  if (argOpers[valArgIndx]->isReg())
7797    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
7798  else
7799    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
7800  if (regOpcL != X86::MOV32rr)
7801    MIB.addReg(tt1);
7802  (*MIB).addOperand(*argOpers[valArgIndx]);
7803  assert(argOpers[valArgIndx + 1]->isReg() ==
7804         argOpers[valArgIndx]->isReg());
7805  assert(argOpers[valArgIndx + 1]->isImm() ==
7806         argOpers[valArgIndx]->isImm());
7807  if (argOpers[valArgIndx + 1]->isReg())
7808    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
7809  else
7810    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
7811  if (regOpcH != X86::MOV32rr)
7812    MIB.addReg(tt2);
7813  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
7814
7815  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
7816  MIB.addReg(t1);
7817  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
7818  MIB.addReg(t2);
7819
7820  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
7821  MIB.addReg(t5);
7822  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
7823  MIB.addReg(t6);
7824
7825  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
7826  for (int i=0; i <= lastAddrIndx; ++i)
7827    (*MIB).addOperand(*argOpers[i]);
7828
7829  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7830  (*MIB).setMemRefs(bInstr->memoperands_begin(),
7831                    bInstr->memoperands_end());
7832
7833  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
7834  MIB.addReg(X86::EAX);
7835  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
7836  MIB.addReg(X86::EDX);
7837
7838  // insert branch
7839  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7840
7841  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
7842  return nextMBB;
7843}
7844
7845// private utility function
7846MachineBasicBlock *
7847X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
7848                                                      MachineBasicBlock *MBB,
7849                                                      unsigned cmovOpc) const {
7850  // For the atomic min/max operator, we generate
7851  //   thisMBB:
7852  //   newMBB:
7853  //     ld t1 = [min/max.addr]
7854  //     mov t2 = [min/max.val]
7855  //     cmp  t1, t2
7856  //     cmov[cond] t2 = t1
7857  //     mov EAX = t1
7858  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
7859  //     bz   newMBB
7860  //     fallthrough -->nextMBB
7861  //
7862  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7863  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7864  MachineFunction::iterator MBBIter = MBB;
7865  ++MBBIter;
7866
7867  /// First build the CFG
7868  MachineFunction *F = MBB->getParent();
7869  MachineBasicBlock *thisMBB = MBB;
7870  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7871  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7872  F->insert(MBBIter, newMBB);
7873  F->insert(MBBIter, nextMBB);
7874
7875  // Move all successors of thisMBB to nextMBB
7876  nextMBB->transferSuccessors(thisMBB);
7877
7878  // Update thisMBB to fall through to newMBB
7879  thisMBB->addSuccessor(newMBB);
7880
7881  // newMBB jumps to itself and falls through to nextMBB
7882  newMBB->addSuccessor(nextMBB);
7883  newMBB->addSuccessor(newMBB);
7884
7885  DebugLoc dl = mInstr->getDebugLoc();
7886  // Insert instructions into newMBB based on incoming instruction
7887  assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7888         "unexpected number of operands");
7889  MachineOperand& destOper = mInstr->getOperand(0);
7890  MachineOperand* argOpers[2 + X86AddrNumOperands];
7891  int numArgs = mInstr->getNumOperands() - 1;
7892  for (int i=0; i < numArgs; ++i)
7893    argOpers[i] = &mInstr->getOperand(i+1);
7894
7895  // x86 address has 5 operands: base, scale, index, displacement, and segment
7896  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
7897  int valArgIndx = lastAddrIndx + 1;
7898
7899  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7900  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
7901  for (int i=0; i <= lastAddrIndx; ++i)
7902    (*MIB).addOperand(*argOpers[i]);
7903
7904  // We only support register and immediate values
7905  assert((argOpers[valArgIndx]->isReg() ||
7906          argOpers[valArgIndx]->isImm()) &&
7907         "invalid operand");
7908
7909  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7910  if (argOpers[valArgIndx]->isReg())
7911    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
7912  else
7913    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
7914  (*MIB).addOperand(*argOpers[valArgIndx]);
7915
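  // LCMPXCHG32 compares implicitly against EAX, so copy the value just
  // loaded from memory into EAX.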
7916  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
7917  MIB.addReg(t1);
7918
7919  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
7920  MIB.addReg(t1);
7921  MIB.addReg(t2);
7922
7923  // Generate movc
7924  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7925  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
7926  MIB.addReg(t2);
7927  MIB.addReg(t1);
7928
7929  // Cmp and exchange if none has modified the memory location
7930  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
7931  for (int i=0; i <= lastAddrIndx; ++i)
7932    (*MIB).addOperand(*argOpers[i]);
7933  MIB.addReg(t3);
7934  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperands");
7935  (*MIB).setMemRefs(mInstr->memoperands_begin(),
7936                    mInstr->memoperands_end());
7937
7938  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
7939  MIB.addReg(X86::EAX);
7940
7941  // insert branch
7942  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7943
7944  F->DeleteMachineInstr(mInstr);   // The pseudo instruction is gone now.
7945  return nextMBB;
7946}
7947
7948// FIXME: Once we get size-specific XMM0 registers, e.g. XMM0_V16I8,
7949// all of this code can be replaced with patterns in the .td file.
7950MachineBasicBlock *
7951X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
7952                            unsigned numArgs, bool memArg) const {
7953
7954  MachineFunction *F = BB->getParent();
7955  DebugLoc dl = MI->getDebugLoc();
7956  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7957
7958  unsigned Opc;
7959  if (memArg)
7960    Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
7961  else
7962    Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
7963
7964  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
7965
7966  for (unsigned i = 0; i < numArgs; ++i) {
7967    MachineOperand &Op = MI->getOperand(i+1);
7968
7969    if (!(Op.isReg() && Op.isImplicit()))
7970      MIB.addOperand(Op);
7971  }
7972
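  // The PCMP*STRM instructions leave their result in XMM0; copy it into the
  // pseudo's destination register.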
7973  BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
7974    .addReg(X86::XMM0);
7975
7976  F->DeleteMachineInstr(MI);
7977
7978  return BB;
7979}
7980
7981MachineBasicBlock *
7982X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
7983                                                 MachineInstr *MI,
7984                                                 MachineBasicBlock *MBB) const {
7985  // Emit code to save XMM registers to the stack. The ABI says that the
7986  // number of registers to save is given in %al, so it's theoretically
7987  // possible to do an indirect jump trick to avoid saving all of them,
7988  // however this code takes a simpler approach and just executes all
7989  // of the stores if %al is non-zero. It's less code, and it's probably
7990  // easier on the hardware branch predictor, and stores aren't all that
7991  // expensive anyway.
7992
7993  // Create the new basic blocks. One block contains all the XMM stores,
7994  // and one block is the final destination regardless of whether any
7995  // stores were performed.
7996  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7997  MachineFunction *F = MBB->getParent();
7998  MachineFunction::iterator MBBIter = MBB;
7999  ++MBBIter;
8000  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
8001  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
8002  F->insert(MBBIter, XMMSaveMBB);
8003  F->insert(MBBIter, EndMBB);
8004
8005  // Set up the CFG.
8006  // Move any original successors of MBB to the end block.
8007  EndMBB->transferSuccessors(MBB);
8008  // The original block will now fall through to the XMM save block.
8009  MBB->addSuccessor(XMMSaveMBB);
8010  // The XMMSaveMBB will fall through to the end block.
8011  XMMSaveMBB->addSuccessor(EndMBB);
8012
8013  // Now add the instructions.
8014  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8015  DebugLoc DL = MI->getDebugLoc();
8016
8017  unsigned CountReg = MI->getOperand(0).getReg();
8018  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
8019  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
8020
8021  if (!Subtarget->isTargetWin64()) {
8022    // If %al is 0, branch around the XMM save block.
8023    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
8024    BuildMI(MBB, DL, TII->get(X86::JE)).addMBB(EndMBB);
8025    MBB->addSuccessor(EndMBB);
8026  }
8027
8028  // In the XMM save block, save all the XMM argument registers.
8029  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
8030    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
8031    MachineMemOperand *MMO =
8032      F->getMachineMemOperand(
8033        PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
8034        MachineMemOperand::MOStore, Offset,
8035        /*Size=*/16, /*Align=*/16);
8036    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
8037      .addFrameIndex(RegSaveFrameIndex)
8038      .addImm(/*Scale=*/1)
8039      .addReg(/*IndexReg=*/0)
8040      .addImm(/*Disp=*/Offset)
8041      .addReg(/*Segment=*/0)
8042      .addReg(MI->getOperand(i).getReg())
8043      .addMemOperand(MMO);
8044  }
8045
8046  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8047
8048  return EndMBB;
8049}
8050
8051MachineBasicBlock *
8052X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
8053                                     MachineBasicBlock *BB,
8054                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
8055  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8056  DebugLoc DL = MI->getDebugLoc();
8057
8058  // To "insert" a SELECT_CC instruction, we actually have to insert the
8059  // diamond control-flow pattern.  The incoming instruction knows the
8060  // destination vreg to set, the condition code register to branch on, the
8061  // true/false values to select between, and a branch opcode to use.
8062  const BasicBlock *LLVM_BB = BB->getBasicBlock();
8063  MachineFunction::iterator It = BB;
8064  ++It;
8065
8066  //  thisMBB:
8067  //  ...
8068  //   TrueVal = ...
8069  //   cmpTY ccX, r1, r2
8070  //   bCC copy1MBB
8071  //   fallthrough --> copy0MBB
8072  MachineBasicBlock *thisMBB = BB;
8073  MachineFunction *F = BB->getParent();
8074  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
8075  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
8076  unsigned Opc =
8077    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
8078  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
8079  F->insert(It, copy0MBB);
8080  F->insert(It, sinkMBB);
8081  // Update machine-CFG edges by first adding all successors of the current
8082  // block to the new block which will contain the Phi node for the select.
8083  // Also inform sdisel of the edge changes.
8084  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
8085         E = BB->succ_end(); I != E; ++I) {
8086    EM->insert(std::make_pair(*I, sinkMBB));
8087    sinkMBB->addSuccessor(*I);
8088  }
8089  // Next, remove all successors of the current block, and add the true
8090  // and fallthrough blocks as its successors.
8091  while (!BB->succ_empty())
8092    BB->removeSuccessor(BB->succ_begin());
8093  // Add the true and fallthrough blocks as its successors.
8094  BB->addSuccessor(copy0MBB);
8095  BB->addSuccessor(sinkMBB);
8096
8097  //  copy0MBB:
8098  //   %FalseValue = ...
8099  //   # fallthrough to sinkMBB
8100  BB = copy0MBB;
8101
8102  // Update machine-CFG edges
8103  BB->addSuccessor(sinkMBB);
8104
8105  //  sinkMBB:
8106  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
8107  //  ...
8108  BB = sinkMBB;
8109  BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg())
8110    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
8111    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
8112
8113  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8114  return BB;
8115}
8116
8117
8118MachineBasicBlock *
8119X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
8120                                               MachineBasicBlock *BB,
8121                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
8122  switch (MI->getOpcode()) {
8123  default: assert(false && "Unexpected instr type to insert");
8124  case X86::CMOV_GR8:
8125  case X86::CMOV_V1I64:
8126  case X86::CMOV_FR32:
8127  case X86::CMOV_FR64:
8128  case X86::CMOV_V4F32:
8129  case X86::CMOV_V2F64:
8130  case X86::CMOV_V2I64:
8131    return EmitLoweredSelect(MI, BB, EM);
8132
8133  case X86::FP32_TO_INT16_IN_MEM:
8134  case X86::FP32_TO_INT32_IN_MEM:
8135  case X86::FP32_TO_INT64_IN_MEM:
8136  case X86::FP64_TO_INT16_IN_MEM:
8137  case X86::FP64_TO_INT32_IN_MEM:
8138  case X86::FP64_TO_INT64_IN_MEM:
8139  case X86::FP80_TO_INT16_IN_MEM:
8140  case X86::FP80_TO_INT32_IN_MEM:
8141  case X86::FP80_TO_INT64_IN_MEM: {
8142    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8143    DebugLoc DL = MI->getDebugLoc();
8144
8145    // Change the floating point control register to use "round towards zero"
8146    // mode when truncating to an integer value.
8147    MachineFunction *F = BB->getParent();
8148    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
8149    addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);
8150
8151    // Load the old value of the high byte of the control word...
8152    unsigned OldCW =
8153      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
8154    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
8155                      CWFrameIdx);
8156
8157    // Set the high part to be round to zero...
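    // (0xC7F keeps all FP exceptions masked and sets the rounding-control
    // field, bits 11:10, to 11 = round toward zero.)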
8158    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
8159      .addImm(0xC7F);
8160
8161    // Reload the modified control word now...
8162    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
8163
8164    // Restore the memory image of control word to original value
8165    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
8166      .addReg(OldCW);
8167
8168    // Get the X86 opcode to use.
8169    unsigned Opc;
8170    switch (MI->getOpcode()) {
8171    default: llvm_unreachable("illegal opcode!");
8172    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
8173    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
8174    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
8175    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
8176    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
8177    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
8178    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
8179    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
8180    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
8181    }
8182
8183    X86AddressMode AM;
8184    MachineOperand &Op = MI->getOperand(0);
8185    if (Op.isReg()) {
8186      AM.BaseType = X86AddressMode::RegBase;
8187      AM.Base.Reg = Op.getReg();
8188    } else {
8189      AM.BaseType = X86AddressMode::FrameIndexBase;
8190      AM.Base.FrameIndex = Op.getIndex();
8191    }
8192    Op = MI->getOperand(1);
8193    if (Op.isImm())
8194      AM.Scale = Op.getImm();
8195    Op = MI->getOperand(2);
8196    if (Op.isImm())
8197      AM.IndexReg = Op.getImm();
8198    Op = MI->getOperand(3);
8199    if (Op.isGlobal()) {
8200      AM.GV = Op.getGlobal();
8201    } else {
8202      AM.Disp = Op.getImm();
8203    }
8204    addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM)
8205                      .addReg(MI->getOperand(X86AddrNumOperands).getReg());
8206
8207    // Reload the original control word now.
8208    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
8209
8210    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8211    return BB;
8212  }
8213    // String/text processing lowering.
8214  case X86::PCMPISTRM128REG:
8215    return EmitPCMP(MI, BB, 3, /*memArg=*/false);
8216  case X86::PCMPISTRM128MEM:
8217    return EmitPCMP(MI, BB, 3, /*memArg=*/true);
8218  case X86::PCMPESTRM128REG:
8219    return EmitPCMP(MI, BB, 5, /*memArg=*/false);
8220  case X86::PCMPESTRM128MEM:
8221    return EmitPCMP(MI, BB, 5, /*memArg=*/true);
8222
8223    // Atomic Lowering.
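    // EmitAtomicBitwiseWithCustomInserter takes, in order: the register and
    // immediate forms of the RMW opcode, the load, cmpxchg and register-copy
    // opcodes, the NOT opcode, the accumulator register, the register class,
    // and a trailing flag that is set only for the NAND variants.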
8224  case X86::ATOMAND32:
8225    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
8226                                               X86::AND32ri, X86::MOV32rm,
8227                                               X86::LCMPXCHG32, X86::MOV32rr,
8228                                               X86::NOT32r, X86::EAX,
8229                                               X86::GR32RegisterClass);
8230  case X86::ATOMOR32:
8231    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
8232                                               X86::OR32ri, X86::MOV32rm,
8233                                               X86::LCMPXCHG32, X86::MOV32rr,
8234                                               X86::NOT32r, X86::EAX,
8235                                               X86::GR32RegisterClass);
8236  case X86::ATOMXOR32:
8237    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
8238                                               X86::XOR32ri, X86::MOV32rm,
8239                                               X86::LCMPXCHG32, X86::MOV32rr,
8240                                               X86::NOT32r, X86::EAX,
8241                                               X86::GR32RegisterClass);
8242  case X86::ATOMNAND32:
8243    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
8244                                               X86::AND32ri, X86::MOV32rm,
8245                                               X86::LCMPXCHG32, X86::MOV32rr,
8246                                               X86::NOT32r, X86::EAX,
8247                                               X86::GR32RegisterClass, true);
8248  case X86::ATOMMIN32:
8249    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
8250  case X86::ATOMMAX32:
8251    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
8252  case X86::ATOMUMIN32:
8253    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
8254  case X86::ATOMUMAX32:
8255    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
8256
8257  case X86::ATOMAND16:
8258    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
8259                                               X86::AND16ri, X86::MOV16rm,
8260                                               X86::LCMPXCHG16, X86::MOV16rr,
8261                                               X86::NOT16r, X86::AX,
8262                                               X86::GR16RegisterClass);
8263  case X86::ATOMOR16:
8264    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
8265                                               X86::OR16ri, X86::MOV16rm,
8266                                               X86::LCMPXCHG16, X86::MOV16rr,
8267                                               X86::NOT16r, X86::AX,
8268                                               X86::GR16RegisterClass);
8269  case X86::ATOMXOR16:
8270    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
8271                                               X86::XOR16ri, X86::MOV16rm,
8272                                               X86::LCMPXCHG16, X86::MOV16rr,
8273                                               X86::NOT16r, X86::AX,
8274                                               X86::GR16RegisterClass);
8275  case X86::ATOMNAND16:
8276    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
8277                                               X86::AND16ri, X86::MOV16rm,
8278                                               X86::LCMPXCHG16, X86::MOV16rr,
8279                                               X86::NOT16r, X86::AX,
8280                                               X86::GR16RegisterClass, true);
8281  case X86::ATOMMIN16:
8282    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
8283  case X86::ATOMMAX16:
8284    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
8285  case X86::ATOMUMIN16:
8286    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
8287  case X86::ATOMUMAX16:
8288    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
8289
8290  case X86::ATOMAND8:
8291    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
8292                                               X86::AND8ri, X86::MOV8rm,
8293                                               X86::LCMPXCHG8, X86::MOV8rr,
8294                                               X86::NOT8r, X86::AL,
8295                                               X86::GR8RegisterClass);
8296  case X86::ATOMOR8:
8297    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
8298                                               X86::OR8ri, X86::MOV8rm,
8299                                               X86::LCMPXCHG8, X86::MOV8rr,
8300                                               X86::NOT8r, X86::AL,
8301                                               X86::GR8RegisterClass);
8302  case X86::ATOMXOR8:
8303    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
8304                                               X86::XOR8ri, X86::MOV8rm,
8305                                               X86::LCMPXCHG8, X86::MOV8rr,
8306                                               X86::NOT8r, X86::AL,
8307                                               X86::GR8RegisterClass);
8308  case X86::ATOMNAND8:
8309    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
8310                                               X86::AND8ri, X86::MOV8rm,
8311                                               X86::LCMPXCHG8, X86::MOV8rr,
8312                                               X86::NOT8r, X86::AL,
8313                                               X86::GR8RegisterClass, true);
8314  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
8315  // This group is for 64-bit host.
8316  case X86::ATOMAND64:
8317    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
8318                                               X86::AND64ri32, X86::MOV64rm,
8319                                               X86::LCMPXCHG64, X86::MOV64rr,
8320                                               X86::NOT64r, X86::RAX,
8321                                               X86::GR64RegisterClass);
8322  case X86::ATOMOR64:
8323    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
8324                                               X86::OR64ri32, X86::MOV64rm,
8325                                               X86::LCMPXCHG64, X86::MOV64rr,
8326                                               X86::NOT64r, X86::RAX,
8327                                               X86::GR64RegisterClass);
8328  case X86::ATOMXOR64:
8329    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
8330                                               X86::XOR64ri32, X86::MOV64rm,
8331                                               X86::LCMPXCHG64, X86::MOV64rr,
8332                                               X86::NOT64r, X86::RAX,
8333                                               X86::GR64RegisterClass);
8334  case X86::ATOMNAND64:
8335    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
8336                                               X86::AND64ri32, X86::MOV64rm,
8337                                               X86::LCMPXCHG64, X86::MOV64rr,
8338                                               X86::NOT64r, X86::RAX,
8339                                               X86::GR64RegisterClass, true);
8340  case X86::ATOMMIN64:
8341    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
8342  case X86::ATOMMAX64:
8343    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
8344  case X86::ATOMUMIN64:
8345    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
8346  case X86::ATOMUMAX64:
8347    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
8348
8349  // This group does 64-bit operations on a 32-bit host.
8350  case X86::ATOMAND6432:
8351    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8352                                               X86::AND32rr, X86::AND32rr,
8353                                               X86::AND32ri, X86::AND32ri,
8354                                               false);
8355  case X86::ATOMOR6432:
8356    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8357                                               X86::OR32rr, X86::OR32rr,
8358                                               X86::OR32ri, X86::OR32ri,
8359                                               false);
8360  case X86::ATOMXOR6432:
8361    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8362                                               X86::XOR32rr, X86::XOR32rr,
8363                                               X86::XOR32ri, X86::XOR32ri,
8364                                               false);
8365  case X86::ATOMNAND6432:
8366    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8367                                               X86::AND32rr, X86::AND32rr,
8368                                               X86::AND32ri, X86::AND32ri,
8369                                               true);
8370  case X86::ATOMADD6432:
8371    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8372                                               X86::ADD32rr, X86::ADC32rr,
8373                                               X86::ADD32ri, X86::ADC32ri,
8374                                               false);
8375  case X86::ATOMSUB6432:
8376    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8377                                               X86::SUB32rr, X86::SBB32rr,
8378                                               X86::SUB32ri, X86::SBB32ri,
8379                                               false);
8380  case X86::ATOMSWAP6432:
8381    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8382                                               X86::MOV32rr, X86::MOV32rr,
8383                                               X86::MOV32ri, X86::MOV32ri,
8384                                               false);
8385  case X86::VASTART_SAVE_XMM_REGS:
8386    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
8387  }
8388}
8389
8390//===----------------------------------------------------------------------===//
8391//                           X86 Optimization Hooks
8392//===----------------------------------------------------------------------===//
8393
8394void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
8395                                                       const APInt &Mask,
8396                                                       APInt &KnownZero,
8397                                                       APInt &KnownOne,
8398                                                       const SelectionDAG &DAG,
8399                                                       unsigned Depth) const {
8400  unsigned Opc = Op.getOpcode();
8401  assert((Opc >= ISD::BUILTIN_OP_END ||
8402          Opc == ISD::INTRINSIC_WO_CHAIN ||
8403          Opc == ISD::INTRINSIC_W_CHAIN ||
8404          Opc == ISD::INTRINSIC_VOID) &&
8405         "Should use MaskedValueIsZero if you don't know whether Op"
8406         " is a target node!");
8407
8408  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
8409  switch (Opc) {
8410  default: break;
8411  case X86ISD::ADD:
8412  case X86ISD::SUB:
8413  case X86ISD::SMUL:
8414  case X86ISD::UMUL:
8415  case X86ISD::INC:
8416  case X86ISD::DEC:
8417  case X86ISD::OR:
8418  case X86ISD::XOR:
8419  case X86ISD::AND:
8420    // These nodes' second result is a boolean.
8421    if (Op.getResNo() == 0)
8422      break;
8423    // Fallthrough
8424  case X86ISD::SETCC:
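    // The condition result is 0 or 1, so every bit above bit 0 is known zero.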
8425    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
8426                                       Mask.getBitWidth() - 1);
8427    break;
8428  }
8429}
8430
8431/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
8432/// node is a GlobalAddress + offset.
8433bool X86TargetLowering::isGAPlusOffset(SDNode *N,
8434                                       GlobalValue* &GA, int64_t &Offset) const{
8435  if (N->getOpcode() == X86ISD::Wrapper) {
8436    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
8437      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
8438      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
8439      return true;
8440    }
8441  }
8442  return TargetLowering::isGAPlusOffset(N, GA, Offset);
8443}
8444
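/// EltsFromConsecutiveLoads - Return true if the elements referenced by the
/// shuffle mask are loads from consecutive memory locations, setting LDBase
/// to the base load and LastLoadedElt to the index of the last element
/// actually loaded.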
8445static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
8446                                     EVT EltVT, LoadSDNode *&LDBase,
8447                                     unsigned &LastLoadedElt,
8448                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
8449                                     const TargetLowering &TLI) {
8450  LDBase = NULL;
8451  LastLoadedElt = -1U;
8452  for (unsigned i = 0; i < NumElems; ++i) {
8453    if (N->getMaskElt(i) < 0) {
8454      if (!LDBase)
8455        return false;
8456      continue;
8457    }
8458
8459    SDValue Elt = DAG.getShuffleScalarElt(N, i);
8460    if (!Elt.getNode() ||
8461        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
8462      return false;
8463    if (!LDBase) {
8464      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
8465        return false;
8466      LDBase = cast<LoadSDNode>(Elt.getNode());
8467      LastLoadedElt = i;
8468      continue;
8469    }
8470    if (Elt.getOpcode() == ISD::UNDEF)
8471      continue;
8472
8473    LoadSDNode *LD = cast<LoadSDNode>(Elt);
8474    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
8475      return false;
8476    LastLoadedElt = i;
8477  }
8478  return true;
8479}
8480
8481/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
8482/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
8483/// if the load addresses are consecutive, non-overlapping, and in the right
8484/// order.  In the case of v2i64, it will see if it can rewrite the
8485/// shuffle to be an appropriate build vector so it can take advantage of
8486/// performBuildVectorCombine.
8487static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
8488                                     const TargetLowering &TLI) {
8489  DebugLoc dl = N->getDebugLoc();
8490  EVT VT = N->getValueType(0);
8491  EVT EltVT = VT.getVectorElementType();
8492  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
8493  unsigned NumElems = VT.getVectorNumElements();
8494
8495  if (VT.getSizeInBits() != 128)
8496    return SDValue();
8497
8498  // Try to combine a vector_shuffle into a 128-bit load.
8499  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
8500  LoadSDNode *LD = NULL;
8501  unsigned LastLoadedElt;
8502  if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG,
8503                                MFI, TLI))
8504    return SDValue();
8505
8506  if (LastLoadedElt == NumElems - 1) {
8507    if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16)
8508      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8509                         LD->getSrcValue(), LD->getSrcValueOffset(),
8510                         LD->isVolatile());
8511    return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8512                       LD->getSrcValue(), LD->getSrcValueOffset(),
8513                       LD->isVolatile(), LD->getAlignment());
8514  } else if (NumElems == 4 && LastLoadedElt == 1) {
8515    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
8516    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
8517    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
8518    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
8519  }
8520  return SDValue();
8521}
8522
8523/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
8524static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
8525                                    const X86Subtarget *Subtarget) {
8526  DebugLoc DL = N->getDebugLoc();
8527  SDValue Cond = N->getOperand(0);
8528  // Get the LHS/RHS of the select.
8529  SDValue LHS = N->getOperand(1);
8530  SDValue RHS = N->getOperand(2);
8531
8532  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
8533  // instructions have the peculiarity that if either operand is a NaN,
8534  // they choose what we call the RHS operand (and as such are not symmetric).
8535  // It happens that this matches the semantics of the common C idiom
8536  // x<y?x:y and related forms, so we can recognize these cases.
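  // For example, (select (setolt x, y), x, y) maps directly to (fmin x, y),
  // while a form such as SETULT is only converted once one of the operands
  // is known not to be a NaN.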
8537  if (Subtarget->hasSSE2() &&
8538      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
8539      Cond.getOpcode() == ISD::SETCC) {
8540    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
8541
8542    unsigned Opcode = 0;
8543    // Check for x CC y ? x : y.
8544    if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
8545      switch (CC) {
8546      default: break;
8547      case ISD::SETULT:
8548        // This can be a min if we can prove that at least one of the operands
8549        // is not a nan.
8550        if (!FiniteOnlyFPMath()) {
8551          if (DAG.isKnownNeverNaN(RHS)) {
8552            // Put the potential NaN in the RHS so that SSE will preserve it.
8553            std::swap(LHS, RHS);
8554          } else if (!DAG.isKnownNeverNaN(LHS))
8555            break;
8556        }
8557        Opcode = X86ISD::FMIN;
8558        break;
8559      case ISD::SETOLE:
8560        // This can be a min if we can prove that at least one of the operands
8561        // is not a nan.
8562        if (!FiniteOnlyFPMath()) {
8563          if (DAG.isKnownNeverNaN(LHS)) {
8564            // Put the potential NaN in the RHS so that SSE will preserve it.
8565            std::swap(LHS, RHS);
8566          } else if (!DAG.isKnownNeverNaN(RHS))
8567            break;
8568        }
8569        Opcode = X86ISD::FMIN;
8570        break;
8571      case ISD::SETULE:
8572        // This can be a min, but if either operand is a NaN we need it to
8573        // preserve the original LHS.
8574        std::swap(LHS, RHS);
8575      case ISD::SETOLT:
8576      case ISD::SETLT:
8577      case ISD::SETLE:
8578        Opcode = X86ISD::FMIN;
8579        break;
8580
8581      case ISD::SETOGE:
8582        // This can be a max if we can prove that at least one of the operands
8583        // is not a nan.
8584        if (!FiniteOnlyFPMath()) {
8585          if (DAG.isKnownNeverNaN(LHS)) {
8586            // Put the potential NaN in the RHS so that SSE will preserve it.
8587            std::swap(LHS, RHS);
8588          } else if (!DAG.isKnownNeverNaN(RHS))
8589            break;
8590        }
8591        Opcode = X86ISD::FMAX;
8592        break;
8593      case ISD::SETUGT:
8594        // This can be a max if we can prove that at least one of the operands
8595        // is not a nan.
8596        if (!FiniteOnlyFPMath()) {
8597          if (DAG.isKnownNeverNaN(RHS)) {
8598            // Put the potential NaN in the RHS so that SSE will preserve it.
8599            std::swap(LHS, RHS);
8600          } else if (!DAG.isKnownNeverNaN(LHS))
8601            break;
8602        }
8603        Opcode = X86ISD::FMAX;
8604        break;
8605      case ISD::SETUGE:
8606        // This can be a max, but if either operand is a NaN we need it to
8607        // preserve the original LHS.
8608        std::swap(LHS, RHS);
8609      case ISD::SETOGT:
8610      case ISD::SETGT:
8611      case ISD::SETGE:
8612        Opcode = X86ISD::FMAX;
8613        break;
8614      }
8615    // Check for x CC y ? y : x -- a min/max with reversed arms.
8616    } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
8617      switch (CC) {
8618      default: break;
8619      case ISD::SETOGE:
8620        // This can be a min if we can prove that at least one of the operands
8621        // is not a nan.
8622        if (!FiniteOnlyFPMath()) {
8623          if (DAG.isKnownNeverNaN(RHS)) {
8624            // Put the potential NaN in the RHS so that SSE will preserve it.
8625            std::swap(LHS, RHS);
8626          } else if (!DAG.isKnownNeverNaN(LHS))
8627            break;
8628        }
8629        Opcode = X86ISD::FMIN;
8630        break;
8631      case ISD::SETUGT:
8632        // This can be a min if we can prove that at least one of the operands
8633        // is not a nan.
8634        if (!FiniteOnlyFPMath()) {
8635          if (DAG.isKnownNeverNaN(LHS)) {
8636            // Put the potential NaN in the RHS so that SSE will preserve it.
8637            std::swap(LHS, RHS);
8638          } else if (!DAG.isKnownNeverNaN(RHS))
8639            break;
8640        }
8641        Opcode = X86ISD::FMIN;
8642        break;
8643      case ISD::SETUGE:
8644        // This can be a min, but if either operand is a NaN we need it to
8645        // preserve the original LHS.
8646        std::swap(LHS, RHS);
8647      case ISD::SETOGT:
8648      case ISD::SETGT:
8649      case ISD::SETGE:
8650        Opcode = X86ISD::FMIN;
8651        break;
8652
8653      case ISD::SETULT:
8654        // This can be a max if we can prove that at least one of the operands
8655        // is not a nan.
8656        if (!FiniteOnlyFPMath()) {
8657          if (DAG.isKnownNeverNaN(LHS)) {
8658            // Put the potential NaN in the RHS so that SSE will preserve it.
8659            std::swap(LHS, RHS);
8660          } else if (!DAG.isKnownNeverNaN(RHS))
8661            break;
8662        }
8663        Opcode = X86ISD::FMAX;
8664        break;
8665      case ISD::SETOLE:
8666        // This can be a max if we can prove that at least one of the operands
8667        // is not a nan.
8668        if (!FiniteOnlyFPMath()) {
8669          if (DAG.isKnownNeverNaN(RHS)) {
8670            // Put the potential NaN in the RHS so that SSE will preserve it.
8671            std::swap(LHS, RHS);
8672          } else if (!DAG.isKnownNeverNaN(LHS))
8673            break;
8674        }
8675        Opcode = X86ISD::FMAX;
8676        break;
8677      case ISD::SETULE:
8678        // This can be a max, but if either operand is a NaN we need it to
8679        // preserve the original LHS.
8680        std::swap(LHS, RHS);
8681      case ISD::SETOLT:
8682      case ISD::SETLT:
8683      case ISD::SETLE:
8684        Opcode = X86ISD::FMAX;
8685        break;
8686      }
8687    }
8688
8689    if (Opcode)
8690      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
8691  }
8692
8693  // If this is a select between two integer constants, try to do some
8694  // optimizations.
8695  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
8696    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
8697      // Don't do this for crazy integer types.
8698      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
8699        // If this is efficiently invertible, canonicalize the TrueC/FalseC values
8700        // so that TrueC (the true value) is larger than FalseC.
8701        bool NeedsCondInvert = false;
8702
8703        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
8704            // Efficiently invertible.
8705            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
8706             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
8707              isa<ConstantSDNode>(Cond.getOperand(1))))) {
8708          NeedsCondInvert = true;
8709          std::swap(TrueC, FalseC);
8710        }
8711
8712        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
8713        if (FalseC->getAPIntValue() == 0 &&
8714            TrueC->getAPIntValue().isPowerOf2()) {
8715          if (NeedsCondInvert) // Invert the condition if needed.
8716            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8717                               DAG.getConstant(1, Cond.getValueType()));
8718
8719          // Zero extend the condition if needed.
8720          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
8721
8722          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
8723          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
8724                             DAG.getConstant(ShAmt, MVT::i8));
8725        }
8726
8727        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
8728        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
8729          if (NeedsCondInvert) // Invert the condition if needed.
8730            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8731                               DAG.getConstant(1, Cond.getValueType()));
8732
8733          // Zero extend the condition if needed.
8734          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
8735                             FalseC->getValueType(0), Cond);
8736          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8737                             SDValue(FalseC, 0));
8738        }
8739
8740        // Optimize cases that will turn into an LEA instruction.  This requires
8741        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
8742        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
8743          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
8744          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
8745
8746          bool isFastMultiplier = false;
8747          if (Diff < 10) {
8748            switch ((unsigned char)Diff) {
8749              default: break;
8750              case 1:  // result = add base, cond
8751              case 2:  // result = lea base(    , cond*2)
8752              case 3:  // result = lea base(cond, cond*2)
8753              case 4:  // result = lea base(    , cond*4)
8754              case 5:  // result = lea base(cond, cond*4)
8755              case 8:  // result = lea base(    , cond*8)
8756              case 9:  // result = lea base(cond, cond*8)
8757                isFastMultiplier = true;
8758                break;
8759            }
8760          }
8761
8762          if (isFastMultiplier) {
8763            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
8764            if (NeedsCondInvert) // Invert the condition if needed.
8765              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8766                                 DAG.getConstant(1, Cond.getValueType()));
8767
8768            // Zero extend the condition if needed.
8769            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
8770                               Cond);
8771            // Scale the condition by the difference.
8772            if (Diff != 1)
8773              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
8774                                 DAG.getConstant(Diff, Cond.getValueType()));
8775
8776            // Add the base if non-zero.
8777            if (FalseC->getAPIntValue() != 0)
8778              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8779                                 SDValue(FalseC, 0));
8780            return Cond;
8781          }
8782        }
8783      }
8784  }
8785
8786  return SDValue();
8787}
8788
8789/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
8790static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
8791                                  TargetLowering::DAGCombinerInfo &DCI) {
8792  DebugLoc DL = N->getDebugLoc();
8793
8794  // If the flag operand isn't dead, don't touch this CMOV.
8795  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
8796    return SDValue();
8797
8798  // If this is a select between two integer constants, try to do some
8799  // optimizations.  Note that the operands are ordered the opposite of SELECT
8800  // operands.
8801  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
8802    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
8803      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
8804      // larger than FalseC (the false value).
8805      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
8806
8807      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
8808        CC = X86::GetOppositeBranchCondition(CC);
8809        std::swap(TrueC, FalseC);
8810      }
8811
8812      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
8813      // This is efficient for any integer data type (including i8/i16) and
8814      // shift amount.
8815      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
8816        SDValue Cond = N->getOperand(3);
8817        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8818                           DAG.getConstant(CC, MVT::i8), Cond);
8819
8820        // Zero extend the condition if needed.
8821        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
8822
8823        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
8824        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
8825                           DAG.getConstant(ShAmt, MVT::i8));
8826        if (N->getNumValues() == 2)  // Dead flag value?
8827          return DCI.CombineTo(N, Cond, SDValue());
8828        return Cond;
8829      }
8830
8831      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
8832      // for any integer data type, including i8/i16.
8833      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
8834        SDValue Cond = N->getOperand(3);
8835        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8836                           DAG.getConstant(CC, MVT::i8), Cond);
8837
8838        // Zero extend the condition if needed.
8839        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
8840                           FalseC->getValueType(0), Cond);
8841        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8842                           SDValue(FalseC, 0));
8843
8844        if (N->getNumValues() == 2)  // Dead flag value?
8845          return DCI.CombineTo(N, Cond, SDValue());
8846        return Cond;
8847      }
8848
8849      // Optimize cases that will turn into an LEA instruction.  This requires
8850      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
8851      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
8852        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
8853        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
8854
8855        bool isFastMultiplier = false;
8856        if (Diff < 10) {
8857          switch ((unsigned char)Diff) {
8858          default: break;
8859          case 1:  // result = add base, cond
8860          case 2:  // result = lea base(    , cond*2)
8861          case 3:  // result = lea base(cond, cond*2)
8862          case 4:  // result = lea base(    , cond*4)
8863          case 5:  // result = lea base(cond, cond*4)
8864          case 8:  // result = lea base(    , cond*8)
8865          case 9:  // result = lea base(cond, cond*8)
8866            isFastMultiplier = true;
8867            break;
8868          }
8869        }
8870
8871        if (isFastMultiplier) {
8872          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
8873          SDValue Cond = N->getOperand(3);
8874          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8875                             DAG.getConstant(CC, MVT::i8), Cond);
8876          // Zero extend the condition if needed.
8877          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
8878                             Cond);
8879          // Scale the condition by the difference.
8880          if (Diff != 1)
8881            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
8882                               DAG.getConstant(Diff, Cond.getValueType()));
8883
8884          // Add the base if non-zero.
8885          if (FalseC->getAPIntValue() != 0)
8886            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8887                               SDValue(FalseC, 0));
8888          if (N->getNumValues() == 2)  // Dead flag value?
8889            return DCI.CombineTo(N, Cond, SDValue());
8890          return Cond;
8891        }
8892      }
8893    }
8894  }
8895  return SDValue();
8896}
8897
8898
8899/// PerformMulCombine - Optimize a single multiply with constant into two
8900/// in order to implement it with two cheaper instructions, e.g.
8901/// LEA + SHL, LEA + LEA.
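/// For example, x*45 can be lowered as (x*9)*5 (two LEAs) and x*40 as a
/// multiply by 5 combined with a shift left by 3.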
8902static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
8903                                 TargetLowering::DAGCombinerInfo &DCI) {
8904  if (DAG.getMachineFunction().
8905      getFunction()->hasFnAttr(Attribute::OptimizeForSize))
8906    return SDValue();
8907
8908  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
8909    return SDValue();
8910
8911  EVT VT = N->getValueType(0);
8912  if (VT != MVT::i64)
8913    return SDValue();
8914
8915  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8916  if (!C)
8917    return SDValue();
8918  uint64_t MulAmt = C->getZExtValue();
8919  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
8920    return SDValue();
8921
8922  uint64_t MulAmt1 = 0;
8923  uint64_t MulAmt2 = 0;
8924  if ((MulAmt % 9) == 0) {
8925    MulAmt1 = 9;
8926    MulAmt2 = MulAmt / 9;
8927  } else if ((MulAmt % 5) == 0) {
8928    MulAmt1 = 5;
8929    MulAmt2 = MulAmt / 5;
8930  } else if ((MulAmt % 3) == 0) {
8931    MulAmt1 = 3;
8932    MulAmt2 = MulAmt / 3;
8933  }
8934  if (MulAmt2 &&
8935      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
8936    DebugLoc DL = N->getDebugLoc();
8937
8938    if (isPowerOf2_64(MulAmt2) &&
8939        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
8940      // If the second multiplier is pow2, issue it first. We want the multiply by
8941      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
8942      // is an add.
8943      std::swap(MulAmt1, MulAmt2);
8944
8945    SDValue NewMul;
8946    if (isPowerOf2_64(MulAmt1))
8947      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
8948                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
8949    else
8950      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
8951                           DAG.getConstant(MulAmt1, VT));
8952
8953    if (isPowerOf2_64(MulAmt2))
8954      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
8955                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
8956    else
8957      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
8958                           DAG.getConstant(MulAmt2, VT));
8959
8960    // Do not add new nodes to DAG combiner worklist.
8961    DCI.CombineTo(N, NewMul, false);
8962  }
8963  return SDValue();
8964}
8965
8966static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
8967  SDValue N0 = N->getOperand(0);
8968  SDValue N1 = N->getOperand(1);
8969  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
8970  EVT VT = N0.getValueType();
8971
8972  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
8973  // since the result of setcc_c is all zeros or all ones.
8974  if (N1C && N0.getOpcode() == ISD::AND &&
8975      N0.getOperand(1).getOpcode() == ISD::Constant) {
8976    SDValue N00 = N0.getOperand(0);
8977    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
8978        ((N00.getOpcode() == ISD::ANY_EXTEND ||
8979          N00.getOpcode() == ISD::ZERO_EXTEND) &&
8980         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
8981      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
8982      APInt ShAmt = N1C->getAPIntValue();
8983      Mask = Mask.shl(ShAmt);
8984      if (Mask != 0)
8985        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
8986                           N00, DAG.getConstant(Mask, VT));
8987    }
8988  }
8989
8990  return SDValue();
8991}
8992
8993/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
8994///                       when possible.
8995static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
8996                                   const X86Subtarget *Subtarget) {
8997  EVT VT = N->getValueType(0);
8998  if (!VT.isVector() && VT.isInteger() &&
8999      N->getOpcode() == ISD::SHL)
9000    return PerformSHLCombine(N, DAG);
9001
9002  // On X86 with SSE2 support, we can transform this to a vector shift if
9003  // all elements are shifted by the same amount.  We can't do this in legalize
9004  // because a constant vector is typically transformed into a constant-pool load,
9005  // so we have no knowledge of the shift amount.
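  // For example, a v4i32 SHL where every lane is shifted by the same amount
  // becomes the x86_sse2_pslli_d intrinsic with a single scalar shift amount.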
9006  if (!Subtarget->hasSSE2())
9007    return SDValue();
9008
9009  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
9010    return SDValue();
9011
9012  SDValue ShAmtOp = N->getOperand(1);
9013  EVT EltVT = VT.getVectorElementType();
9014  DebugLoc DL = N->getDebugLoc();
9015  SDValue BaseShAmt = SDValue();
9016  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
9017    unsigned NumElts = VT.getVectorNumElements();
9018    unsigned i = 0;
9019    for (; i != NumElts; ++i) {
9020      SDValue Arg = ShAmtOp.getOperand(i);
9021      if (Arg.getOpcode() == ISD::UNDEF) continue;
9022      BaseShAmt = Arg;
9023      break;
9024    }
9025    for (; i != NumElts; ++i) {
9026      SDValue Arg = ShAmtOp.getOperand(i);
9027      if (Arg.getOpcode() == ISD::UNDEF) continue;
9028      if (Arg != BaseShAmt) {
9029        return SDValue();
9030      }
9031    }
9032  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
9033             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
9034    SDValue InVec = ShAmtOp.getOperand(0);
9035    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
9036      unsigned NumElts = InVec.getValueType().getVectorNumElements();
9037      unsigned i = 0;
9038      for (; i != NumElts; ++i) {
9039        SDValue Arg = InVec.getOperand(i);
9040        if (Arg.getOpcode() == ISD::UNDEF) continue;
9041        BaseShAmt = Arg;
9042        break;
9043      }
9044    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
9045       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
9046         unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
9047         if (C->getZExtValue() == SplatIdx)
9048           BaseShAmt = InVec.getOperand(1);
9049       }
9050    }
9051    if (BaseShAmt.getNode() == 0)
9052      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
9053                              DAG.getIntPtrConstant(0));
9054  } else
9055    return SDValue();
9056
9057  // The shift amount is an i32.
9058  if (EltVT.bitsGT(MVT::i32))
9059    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
9060  else if (EltVT.bitsLT(MVT::i32))
9061    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
9062
9063  // The shift amount is identical so we can do a vector shift.
9064  SDValue  ValOp = N->getOperand(0);
9065  switch (N->getOpcode()) {
9066  default:
9067    llvm_unreachable("Unknown shift opcode!");
9068    break;
9069  case ISD::SHL:
9070    if (VT == MVT::v2i64)
9071      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9072                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
9073                         ValOp, BaseShAmt);
9074    if (VT == MVT::v4i32)
9075      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9076                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
9077                         ValOp, BaseShAmt);
9078    if (VT == MVT::v8i16)
9079      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9080                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
9081                         ValOp, BaseShAmt);
9082    break;
9083  case ISD::SRA:
9084    if (VT == MVT::v4i32)
9085      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9086                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
9087                         ValOp, BaseShAmt);
9088    if (VT == MVT::v8i16)
9089      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9090                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
9091                         ValOp, BaseShAmt);
9092    break;
9093  case ISD::SRL:
9094    if (VT == MVT::v2i64)
9095      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9096                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
9097                         ValOp, BaseShAmt);
9098    if (VT == MVT::v4i32)
9099      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9100                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
9101                         ValOp, BaseShAmt);
9102    if (VT ==  MVT::v8i16)
9103      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9104                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
9105                         ValOp, BaseShAmt);
9106    break;
9107  }
9108  return SDValue();
9109}
9110
9111/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
9112static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
9113                                   const X86Subtarget *Subtarget) {
9114  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
9115  // the FP state in cases where an emms may be missing.
9116  // A preferable solution to the general problem is to figure out the right
9117  // places to insert EMMS.  This qualifies as a quick hack.
9118
9119  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
9120  StoreSDNode *St = cast<StoreSDNode>(N);
9121  EVT VT = St->getValue().getValueType();
9122  if (VT.getSizeInBits() != 64)
9123    return SDValue();
9124
9125  const Function *F = DAG.getMachineFunction().getFunction();
9126  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
9127  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
9128    && Subtarget->hasSSE2();
9129  if ((VT.isVector() ||
9130       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
9131      isa<LoadSDNode>(St->getValue()) &&
9132      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
9133      St->getChain().hasOneUse() && !St->isVolatile()) {
9134    SDNode* LdVal = St->getValue().getNode();
9135    LoadSDNode *Ld = 0;
9136    int TokenFactorIndex = -1;
9137    SmallVector<SDValue, 8> Ops;
9138    SDNode* ChainVal = St->getChain().getNode();
9139    // Must be a store of a load.  We currently handle two cases:  the load
9140    // is a direct child, and it's under an intervening TokenFactor.  It is
9141    // possible to dig deeper under nested TokenFactors.
9142    if (ChainVal == LdVal)
9143      Ld = cast<LoadSDNode>(St->getChain());
9144    else if (St->getValue().hasOneUse() &&
9145             ChainVal->getOpcode() == ISD::TokenFactor) {
9146      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
9147        if (ChainVal->getOperand(i).getNode() == LdVal) {
9148          TokenFactorIndex = i;
9149          Ld = cast<LoadSDNode>(St->getValue());
9150        } else
9151          Ops.push_back(ChainVal->getOperand(i));
9152      }
9153    }
9154
9155    if (!Ld || !ISD::isNormalLoad(Ld))
9156      return SDValue();
9157
9158    // If this is not the MMX case, i.e. we are just turning i64 load/store
9159    // into f64 load/store, avoid the transformation if there are multiple
9160    // uses of the loaded value.
9161    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
9162      return SDValue();
9163
9164    DebugLoc LdDL = Ld->getDebugLoc();
9165    DebugLoc StDL = N->getDebugLoc();
9166    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
9167    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
9168    // pair instead.
9169    if (Subtarget->is64Bit() || F64IsLegal) {
9170      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
9171      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
9172                                  Ld->getBasePtr(), Ld->getSrcValue(),
9173                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
9174                                  Ld->getAlignment());
9175      SDValue NewChain = NewLd.getValue(1);
9176      if (TokenFactorIndex != -1) {
9177        Ops.push_back(NewChain);
9178        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
9179                               Ops.size());
9180      }
9181      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
9182                          St->getSrcValue(), St->getSrcValueOffset(),
9183                          St->isVolatile(), St->getAlignment());
9184    }
9185
9186    // Otherwise, lower to two pairs of 32-bit loads / stores.
9187    SDValue LoAddr = Ld->getBasePtr();
9188    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
9189                                 DAG.getConstant(4, MVT::i32));
9190
9191    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
9192                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
9193                               Ld->isVolatile(), Ld->getAlignment());
9194    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
9195                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
9196                               Ld->isVolatile(),
9197                               MinAlign(Ld->getAlignment(), 4));
9198
9199    SDValue NewChain = LoLd.getValue(1);
9200    if (TokenFactorIndex != -1) {
9201      Ops.push_back(LoLd);
9202      Ops.push_back(HiLd);
9203      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
9204                             Ops.size());
9205    }
9206
9207    LoAddr = St->getBasePtr();
9208    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
9209                         DAG.getConstant(4, MVT::i32));
9210
9211    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
9212                                St->getSrcValue(), St->getSrcValueOffset(),
9213                                St->isVolatile(), St->getAlignment());
9214    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
9215                                St->getSrcValue(),
9216                                St->getSrcValueOffset() + 4,
9217                                St->isVolatile(),
9218                                MinAlign(St->getAlignment(), 4));
9219    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
9220  }
9221  return SDValue();
9222}
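// Illustration (hypothetical source, not from the original file): on a 32-bit
// target with SSE2 available, code such as
//
//   void copy64(long long *d, long long *s) { *d = *s; }
//
// reaches this combine as an i64 load feeding an i64 store.  The code above
// rewrites the pair as a single f64 (movsd-style) load/store, and only falls
// back to two i32 load/store pairs when f64 is not usable.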
9223
9224/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
9225/// X86ISD::FXOR nodes.
9226static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
9227  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
9228  // F[X]OR(0.0, x) -> x
9229  // F[X]OR(x, 0.0) -> x
9230  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
9231    if (C->getValueAPF().isPosZero())
9232      return N->getOperand(1);
9233  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
9234    if (C->getValueAPF().isPosZero())
9235      return N->getOperand(0);
9236  return SDValue();
9237}
9238
9239/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
9240static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
9241  // FAND(0.0, x) -> 0.0
9242  // FAND(x, 0.0) -> 0.0
9243  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
9244    if (C->getValueAPF().isPosZero())
9245      return N->getOperand(0);
9246  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
9247    if (C->getValueAPF().isPosZero())
9248      return N->getOperand(1);
9249  return SDValue();
9250}
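// Note (added for clarity): these identities are sound because FAND, FOR and
// FXOR are purely bitwise operations on the FP value and +0.0 is the all-zero
// bit pattern, so FOR(+0.0, x) == x and FAND(+0.0, x) == +0.0 bit-for-bit.
// -0.0 (sign bit set) is deliberately not matched, since it is neither an
// identity nor an annihilator for these bitwise ops.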
9251
9252static SDValue PerformBTCombine(SDNode *N,
9253                                SelectionDAG &DAG,
9254                                TargetLowering::DAGCombinerInfo &DCI) {
9255  // BT ignores high bits in the bit index operand.
9256  SDValue Op1 = N->getOperand(1);
9257  if (Op1.hasOneUse()) {
9258    unsigned BitWidth = Op1.getValueSizeInBits();
9259    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
9260    APInt KnownZero, KnownOne;
9261    TargetLowering::TargetLoweringOpt TLO(DAG);
9262    TargetLowering &TLI = DAG.getTargetLoweringInfo();
9263    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
9264        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
9265      DCI.CommitTargetLoweringOpt(TLO);
9266  }
9267  return SDValue();
9268}
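// Illustration (not part of the original source): X86 BT only examines the
// low log2(width) bits of the bit-index operand, so for a 32-bit BT an index
// computed as (and %idx, 31) only has bits [4:0] demanded, and the
// SimplifyDemandedBits call above can shrink or remove the mask entirely.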
9269
9270static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
9271  SDValue Op = N->getOperand(0);
9272  if (Op.getOpcode() == ISD::BIT_CONVERT)
9273    Op = Op.getOperand(0);
9274  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
9275  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
9276      VT.getVectorElementType().getSizeInBits() ==
9277      OpVT.getVectorElementType().getSizeInBits()) {
9278    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
9279  }
9280  return SDValue();
9281}
9282
9283// On X86 and X86-64, atomic operations are lowered to locked instructions.
9284// Locked instructions, in turn, have implicit fence semantics (all memory
9285// operations are flushed before issuing the locked instruction, and the
9286// are not buffered), so we can fold away the common pattern of
9287// fence-atomic-fence.
9288static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
9289  SDValue atomic = N->getOperand(0);
9290  switch (atomic.getOpcode()) {
9291    case ISD::ATOMIC_CMP_SWAP:
9292    case ISD::ATOMIC_SWAP:
9293    case ISD::ATOMIC_LOAD_ADD:
9294    case ISD::ATOMIC_LOAD_SUB:
9295    case ISD::ATOMIC_LOAD_AND:
9296    case ISD::ATOMIC_LOAD_OR:
9297    case ISD::ATOMIC_LOAD_XOR:
9298    case ISD::ATOMIC_LOAD_NAND:
9299    case ISD::ATOMIC_LOAD_MIN:
9300    case ISD::ATOMIC_LOAD_MAX:
9301    case ISD::ATOMIC_LOAD_UMIN:
9302    case ISD::ATOMIC_LOAD_UMAX:
9303      break;
9304    default:
9305      return SDValue();
9306  }
9307
9308  SDValue fence = atomic.getOperand(0);
9309  if (fence.getOpcode() != ISD::MEMBARRIER)
9310    return SDValue();
9311
9312  switch (atomic.getOpcode()) {
9313    case ISD::ATOMIC_CMP_SWAP:
9314      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9315                                    atomic.getOperand(1), atomic.getOperand(2),
9316                                    atomic.getOperand(3));
9317    case ISD::ATOMIC_SWAP:
9318    case ISD::ATOMIC_LOAD_ADD:
9319    case ISD::ATOMIC_LOAD_SUB:
9320    case ISD::ATOMIC_LOAD_AND:
9321    case ISD::ATOMIC_LOAD_OR:
9322    case ISD::ATOMIC_LOAD_XOR:
9323    case ISD::ATOMIC_LOAD_NAND:
9324    case ISD::ATOMIC_LOAD_MIN:
9325    case ISD::ATOMIC_LOAD_MAX:
9326    case ISD::ATOMIC_LOAD_UMIN:
9327    case ISD::ATOMIC_LOAD_UMAX:
9328      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9329                                    atomic.getOperand(1), atomic.getOperand(2));
9330    default:
9331      return SDValue();
9332  }
9333}
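// Illustration (hypothetical source, not from the original file): a sequence
// such as
//
//   __sync_synchronize();          // leading fence
//   __sync_fetch_and_add(&x, 1);   // lowered to a lock-prefixed instruction
//   __sync_synchronize();          // trailing fence
//
// reaches this combine as MEMBARRIER(ATOMIC_LOAD_ADD(MEMBARRIER(...))).  The
// combine fires on the trailing barrier: it splices the atomic's chain past
// the leading fence and returns the atomic itself, so both fences fold away.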
9334
9335static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
9336  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
9337  //           (and (i32 x86isd::setcc_carry), 1)
9338  // This eliminates the zext. This transformation is necessary because
9339  // ISD::SETCC is always legalized to i8.
9340  DebugLoc dl = N->getDebugLoc();
9341  SDValue N0 = N->getOperand(0);
9342  EVT VT = N->getValueType(0);
9343  if (N0.getOpcode() == ISD::AND &&
9344      N0.hasOneUse() &&
9345      N0.getOperand(0).hasOneUse()) {
9346    SDValue N00 = N0.getOperand(0);
9347    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
9348      return SDValue();
9349    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9350    if (!C || C->getZExtValue() != 1)
9351      return SDValue();
9352    return DAG.getNode(ISD::AND, dl, VT,
9353                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
9354                                   N00.getOperand(0), N00.getOperand(1)),
9355                       DAG.getConstant(1, VT));
9356  }
9357
9358  return SDValue();
9359}
9360
9361SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
9362                                             DAGCombinerInfo &DCI) const {
9363  SelectionDAG &DAG = DCI.DAG;
9364  switch (N->getOpcode()) {
9365  default: break;
9366  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
9367  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
9368  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
9369  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
9370  case ISD::SHL:
9371  case ISD::SRA:
9372  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
9373  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
9374  case X86ISD::FXOR:
9375  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
9376  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
9377  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
9378  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
9379  case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
9380  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
9381  }
9382
9383  return SDValue();
9384}
9385
9386//===----------------------------------------------------------------------===//
9387//                           X86 Inline Assembly Support
9388//===----------------------------------------------------------------------===//
9389
9390static bool LowerToBSwap(CallInst *CI) {
9391  // FIXME: this should verify that we are targeting a 486 or better.  If not,
9392  // we will turn this bswap into something that will be lowered to logical ops
9393  // instead of emitting the bswap asm.  For now, we don't support 486 or lower,
9394  // so don't worry about this.
9395
9396  // Verify this is a simple bswap.
9397  if (CI->getNumOperands() != 2 ||
9398      CI->getType() != CI->getOperand(1)->getType() ||
9399      !CI->getType()->isInteger())
9400    return false;
9401
9402  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
9403  if (!Ty || Ty->getBitWidth() % 16 != 0)
9404    return false;
9405
9406  // Okay, we can do this xform, do so now.
9407  const Type *Tys[] = { Ty };
9408  Module *M = CI->getParent()->getParent()->getParent();
9409  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
9410
9411  Value *Op = CI->getOperand(1);
9412  Op = CallInst::Create(Int, Op, CI->getName(), CI);
9413
9414  CI->replaceAllUsesWith(Op);
9415  CI->eraseFromParent();
9416  return true;
9417}
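// Illustration (not part of the original source): at the IR level this rewrite
// turns
//
//   %y = call i32 asm "bswap $0", "=r,0"(i32 %x)
//
// into
//
//   %y = call i32 @llvm.bswap.i32(i32 %x)
//
// letting the normal ISel path pick the BSWAP instruction (or logical ops on
// pre-486 targets, per the FIXME above).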
9418
9419bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
9420  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
9421  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
9422
9423  std::string AsmStr = IA->getAsmString();
9424
9425  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
9426  std::vector<std::string> AsmPieces;
9427  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?
9428
9429  switch (AsmPieces.size()) {
9430  default: return false;
9431  case 1:
9432    AsmStr = AsmPieces[0];
9433    AsmPieces.clear();
9434    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
9435
9436    // bswap $0
9437    if (AsmPieces.size() == 2 &&
9438        (AsmPieces[0] == "bswap" ||
9439         AsmPieces[0] == "bswapq" ||
9440         AsmPieces[0] == "bswapl") &&
9441        (AsmPieces[1] == "$0" ||
9442         AsmPieces[1] == "${0:q}")) {
9443      // No need to check constraints; nothing other than the equivalent of
9444      // "=r,0" would be valid here.
9445      return LowerToBSwap(CI);
9446    }
9447    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
9448    if (CI->getType() == Type::getInt16Ty(CI->getContext()) &&
9449        AsmPieces.size() == 3 &&
9450        AsmPieces[0] == "rorw" &&
9451        AsmPieces[1] == "$$8," &&
9452        AsmPieces[2] == "${0:w}" &&
9453        IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
9454      return LowerToBSwap(CI);
9455    }
9456    break;
9457  case 3:
9458    if (CI->getType() == Type::getInt64Ty(CI->getContext()) &&
9459        Constraints.size() >= 2 &&
9460        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
9461        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
9462      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
9463      std::vector<std::string> Words;
9464      SplitString(AsmPieces[0], Words, " \t");
9465      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
9466        Words.clear();
9467        SplitString(AsmPieces[1], Words, " \t");
9468        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
9469          Words.clear();
9470          SplitString(AsmPieces[2], Words, " \t,");
9471          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
9472              Words[2] == "%edx") {
9473            return LowerToBSwap(CI);
9474          }
9475        }
9476      }
9477    }
9478    break;
9479  }
9480  return false;
9481}
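// Illustration (hypothetical input, not from the original file): the
// three-piece case above matches i64 inline asm of, schematically, the form
//
//   call i64 asm "bswap %eax\0Abswap %edx\0Axchgl %eax, %edx", "=A,0"(i64 %v)
//
// i.e. the classic 32-bit idiom for byte-swapping a 64-bit value held in
// EDX:EAX, and rewrites it into @llvm.bswap.i64 via LowerToBSwap.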
9482
9483
9484
9485/// getConstraintType - Given a constraint letter, return the type of
9486/// constraint it is for this target.
9487X86TargetLowering::ConstraintType
9488X86TargetLowering::getConstraintType(const std::string &Constraint) const {
9489  if (Constraint.size() == 1) {
9490    switch (Constraint[0]) {
9491    case 'A':
9492      return C_Register;
9493    case 'f':
9494    case 'r':
9495    case 'R':
9496    case 'l':
9497    case 'q':
9498    case 'Q':
9499    case 'x':
9500    case 'y':
9501    case 'Y':
9502      return C_RegisterClass;
9503    case 'e':
9504    case 'Z':
9505      return C_Other;
9506    default:
9507      break;
9508    }
9509  }
9510  return TargetLowering::getConstraintType(Constraint);
9511}
9512
9513/// LowerXConstraint - try to replace an X constraint, which matches anything,
9514/// with another that has more specific requirements based on the type of the
9515/// corresponding operand.
9516const char *X86TargetLowering::
9517LowerXConstraint(EVT ConstraintVT) const {
9518  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
9519  // 'f' like normal targets.
9520  if (ConstraintVT.isFloatingPoint()) {
9521    if (Subtarget->hasSSE2())
9522      return "Y";
9523    if (Subtarget->hasSSE1())
9524      return "x";
9525  }
9526
9527  return TargetLowering::LowerXConstraint(ConstraintVT);
9528}
9529
9530/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
9531/// vector.  If it is invalid, don't add anything to Ops.
9532void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
9533                                                     char Constraint,
9534                                                     bool hasMemory,
9535                                                     std::vector<SDValue>&Ops,
9536                                                     SelectionDAG &DAG) const {
9537  SDValue Result(0, 0);
9538
9539  switch (Constraint) {
9540  default: break;
9541  case 'I':
9542    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9543      if (C->getZExtValue() <= 31) {
9544        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
9545        break;
9546      }
9547    }
9548    return;
9549  case 'J':
9550    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9551      if (C->getZExtValue() <= 63) {
9552        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
9553        break;
9554      }
9555    }
9556    return;
9557  case 'K':
9558    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9559      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
9560        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
9561        break;
9562      }
9563    }
9564    return;
9565  case 'N':
9566    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9567      if (C->getZExtValue() <= 255) {
9568        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
9569        break;
9570      }
9571    }
9572    return;
9573  case 'e': {
9574    // 32-bit signed value
9575    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9576      const ConstantInt *CI = C->getConstantIntValue();
9577      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
9578                                  C->getSExtValue())) {
9579        // Widen to 64 bits here to get it sign extended.
9580        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
9581        break;
9582      }
9583    // FIXME gcc accepts some relocatable values here too, but only in certain
9584    // memory models; it's complicated.
9585    }
9586    return;
9587  }
9588  case 'Z': {
9589    // 32-bit unsigned value
9590    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9591      const ConstantInt *CI = C->getConstantIntValue();
9592      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
9593                                  C->getZExtValue())) {
9594        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
9595        break;
9596      }
9597    }
9598    // FIXME gcc accepts some relocatable values here too, but only in certain
9599    // memory models; it's complicated.
9600    return;
9601  }
9602  case 'i': {
9603    // Literal immediates are always ok.
9604    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
9605      // Widen to 64 bits here to get it sign extended.
9606      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
9607      break;
9608    }
9609
9610    // If we are in non-pic codegen mode, we allow the address of a global (with
9611    // an optional displacement) to be used with 'i'.
9612    GlobalAddressSDNode *GA = 0;
9613    int64_t Offset = 0;
9614
9615    // Match either (GA), (GA+C), (GA+C1+C2), etc.
9616    while (1) {
9617      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
9618        Offset += GA->getOffset();
9619        break;
9620      } else if (Op.getOpcode() == ISD::ADD) {
9621        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
9622          Offset += C->getZExtValue();
9623          Op = Op.getOperand(0);
9624          continue;
9625        }
9626      } else if (Op.getOpcode() == ISD::SUB) {
9627        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
9628          Offset += -C->getZExtValue();
9629          Op = Op.getOperand(0);
9630          continue;
9631        }
9632      }
9633
9634      // Otherwise, this isn't something we can handle, reject it.
9635      return;
9636    }
9637
9638    GlobalValue *GV = GA->getGlobal();
9639    // If we require an extra load to get this address, as in PIC mode, we
9640    // can't accept it.
9641    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
9642                                                        getTargetMachine())))
9643      return;
9644
9645    if (hasMemory)
9646      Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
9647    else
9648      Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
9649    Result = Op;
9650    break;
9651  }
9652  }
9653
9654  if (Result.getNode()) {
9655    Ops.push_back(Result);
9656    return;
9657  }
9658  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
9659                                                      Ops, DAG);
9660}
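// Summary (added for clarity) of the immediate-constraint ranges handled
// above, following the GCC x86 constraint letters:
//   'I' : 0..31   (e.g. 32-bit shift counts)
//   'J' : 0..63   (64-bit shift counts)
//   'K' : signed 8-bit immediates
//   'N' : 0..255  (unsigned 8-bit, e.g. in/out port numbers)
//   'e' : signed 32-bit immediates
//   'Z' : unsigned 32-bit immediates
// Values outside the range are rejected by returning without adding to Ops.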
9661
9662std::vector<unsigned> X86TargetLowering::
9663getRegClassForInlineAsmConstraint(const std::string &Constraint,
9664                                  EVT VT) const {
9665  if (Constraint.size() == 1) {
9666    // FIXME: not handling fp-stack yet!
9667    switch (Constraint[0]) {      // GCC X86 Constraint Letters
9668    default: break;  // Unknown constraint letter
9669    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
9670      if (Subtarget->is64Bit()) {
9671        if (VT == MVT::i32)
9672          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
9673                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
9674                                       X86::R10D,X86::R11D,X86::R12D,
9675                                       X86::R13D,X86::R14D,X86::R15D,
9676                                       X86::EBP, X86::ESP, 0);
9677        else if (VT == MVT::i16)
9678          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
9679                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
9680                                       X86::R10W,X86::R11W,X86::R12W,
9681                                       X86::R13W,X86::R14W,X86::R15W,
9682                                       X86::BP,  X86::SP, 0);
9683        else if (VT == MVT::i8)
9684          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
9685                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
9686                                       X86::R10B,X86::R11B,X86::R12B,
9687                                       X86::R13B,X86::R14B,X86::R15B,
9688                                       X86::BPL, X86::SPL, 0);
9689
9690        else if (VT == MVT::i64)
9691          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
9692                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
9693                                       X86::R10, X86::R11, X86::R12,
9694                                       X86::R13, X86::R14, X86::R15,
9695                                       X86::RBP, X86::RSP, 0);
9696
9697        break;
9698      }
9699      // 32-bit fallthrough
9700    case 'Q':   // Q_REGS
9701      if (VT == MVT::i32)
9702        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
9703      else if (VT == MVT::i16)
9704        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
9705      else if (VT == MVT::i8)
9706        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
9707      else if (VT == MVT::i64)
9708        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
9709      break;
9710    }
9711  }
9712
9713  return std::vector<unsigned>();
9714}
9715
9716std::pair<unsigned, const TargetRegisterClass*>
9717X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
9718                                                EVT VT) const {
9719  // First, see if this is a constraint that directly corresponds to an LLVM
9720  // register class.
9721  if (Constraint.size() == 1) {
9722    // GCC Constraint Letters
9723    switch (Constraint[0]) {
9724    default: break;
9725    case 'r':   // GENERAL_REGS
9726    case 'l':   // INDEX_REGS
9727      if (VT == MVT::i8)
9728        return std::make_pair(0U, X86::GR8RegisterClass);
9729      if (VT == MVT::i16)
9730        return std::make_pair(0U, X86::GR16RegisterClass);
9731      if (VT == MVT::i32 || !Subtarget->is64Bit())
9732        return std::make_pair(0U, X86::GR32RegisterClass);
9733      return std::make_pair(0U, X86::GR64RegisterClass);
9734    case 'R':   // LEGACY_REGS
9735      if (VT == MVT::i8)
9736        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
9737      if (VT == MVT::i16)
9738        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
9739      if (VT == MVT::i32 || !Subtarget->is64Bit())
9740        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
9741      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
9742    case 'f':  // FP Stack registers.
9743      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
9744      // value to the correct fpstack register class.
9745      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
9746        return std::make_pair(0U, X86::RFP32RegisterClass);
9747      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
9748        return std::make_pair(0U, X86::RFP64RegisterClass);
9749      return std::make_pair(0U, X86::RFP80RegisterClass);
9750    case 'y':   // MMX_REGS if MMX allowed.
9751      if (!Subtarget->hasMMX()) break;
9752      return std::make_pair(0U, X86::VR64RegisterClass);
9753    case 'Y':   // SSE_REGS if SSE2 allowed
9754      if (!Subtarget->hasSSE2()) break;
9755      // FALL THROUGH.
9756    case 'x':   // SSE_REGS if SSE1 allowed
9757      if (!Subtarget->hasSSE1()) break;
9758
9759      switch (VT.getSimpleVT().SimpleTy) {
9760      default: break;
9761      // Scalar SSE types.
9762      case MVT::f32:
9763      case MVT::i32:
9764        return std::make_pair(0U, X86::FR32RegisterClass);
9765      case MVT::f64:
9766      case MVT::i64:
9767        return std::make_pair(0U, X86::FR64RegisterClass);
9768      // Vector types.
9769      case MVT::v16i8:
9770      case MVT::v8i16:
9771      case MVT::v4i32:
9772      case MVT::v2i64:
9773      case MVT::v4f32:
9774      case MVT::v2f64:
9775        return std::make_pair(0U, X86::VR128RegisterClass);
9776      }
9777      break;
9778    }
9779  }
9780
9781  // Use the default implementation in TargetLowering to convert the register
9782  // constraint into a member of a register class.
9783  std::pair<unsigned, const TargetRegisterClass*> Res;
9784  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
9785
9786  // Not found as a standard register?
9787  if (Res.second == 0) {
9788    // Map {st(0)} .. {st(7)} constraints to the ST0 .. ST7 registers.
9789    if (Constraint.size() == 7 && Constraint[0] == '{' &&
9790        tolower(Constraint[1]) == 's' &&
9791        tolower(Constraint[2]) == 't' &&
9792        Constraint[3] == '(' &&
9793        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
9794        Constraint[5] == ')' &&
9795        Constraint[6] == '}') {
9796
9797      Res.first = X86::ST0+Constraint[4]-'0';
9798      Res.second = X86::RFP80RegisterClass;
9799      return Res;
9800    }
9801
9802    // GCC allows "st(0)" to be called just plain "st".
9803    if (StringRef("{st}").equals_lower(Constraint)) {
9804      Res.first = X86::ST0;
9805      Res.second = X86::RFP80RegisterClass;
9806      return Res;
9807    }
9808
9809    // flags -> EFLAGS
9810    if (StringRef("{flags}").equals_lower(Constraint)) {
9811      Res.first = X86::EFLAGS;
9812      Res.second = X86::CCRRegisterClass;
9813      return Res;
9814    }
9815
9816    // 'A' means EAX + EDX.
9817    if (Constraint == "A") {
9818      Res.first = X86::EAX;
9819      Res.second = X86::GR32_ADRegisterClass;
9820      return Res;
9821    }
9822    return Res;
9823  }
9824
9825  // Otherwise, check to see if this is a register class of the wrong value
9826  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it to
9827  // turn into {ax},{dx}.
9828  if (Res.second->hasType(VT))
9829    return Res;   // Correct type already, nothing to do.
9830
9831  // All of the single-register GCC register classes map their values onto
9832  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
9833  // really want an 8-bit or 32-bit register, map to the appropriate register
9834  // class and return the appropriate register.
9835  if (Res.second == X86::GR16RegisterClass) {
9836    if (VT == MVT::i8) {
9837      unsigned DestReg = 0;
9838      switch (Res.first) {
9839      default: break;
9840      case X86::AX: DestReg = X86::AL; break;
9841      case X86::DX: DestReg = X86::DL; break;
9842      case X86::CX: DestReg = X86::CL; break;
9843      case X86::BX: DestReg = X86::BL; break;
9844      }
9845      if (DestReg) {
9846        Res.first = DestReg;
9847        Res.second = X86::GR8RegisterClass;
9848      }
9849    } else if (VT == MVT::i32) {
9850      unsigned DestReg = 0;
9851      switch (Res.first) {
9852      default: break;
9853      case X86::AX: DestReg = X86::EAX; break;
9854      case X86::DX: DestReg = X86::EDX; break;
9855      case X86::CX: DestReg = X86::ECX; break;
9856      case X86::BX: DestReg = X86::EBX; break;
9857      case X86::SI: DestReg = X86::ESI; break;
9858      case X86::DI: DestReg = X86::EDI; break;
9859      case X86::BP: DestReg = X86::EBP; break;
9860      case X86::SP: DestReg = X86::ESP; break;
9861      }
9862      if (DestReg) {
9863        Res.first = DestReg;
9864        Res.second = X86::GR32RegisterClass;
9865      }
9866    } else if (VT == MVT::i64) {
9867      unsigned DestReg = 0;
9868      switch (Res.first) {
9869      default: break;
9870      case X86::AX: DestReg = X86::RAX; break;
9871      case X86::DX: DestReg = X86::RDX; break;
9872      case X86::CX: DestReg = X86::RCX; break;
9873      case X86::BX: DestReg = X86::RBX; break;
9874      case X86::SI: DestReg = X86::RSI; break;
9875      case X86::DI: DestReg = X86::RDI; break;
9876      case X86::BP: DestReg = X86::RBP; break;
9877      case X86::SP: DestReg = X86::RSP; break;
9878      }
9879      if (DestReg) {
9880        Res.first = DestReg;
9881        Res.second = X86::GR64RegisterClass;
9882      }
9883    }
9884  } else if (Res.second == X86::FR32RegisterClass ||
9885             Res.second == X86::FR64RegisterClass ||
9886             Res.second == X86::VR128RegisterClass) {
9887    // Handle references to XMM physical registers that got mapped into the
9888    // wrong class.  This can happen with constraints like {xmm0} where the
9889    // target independent register mapper will just pick the first match it can
9890    // find, ignoring the required type.
9891    if (VT == MVT::f32)
9892      Res.second = X86::FR32RegisterClass;
9893    else if (VT == MVT::f64)
9894      Res.second = X86::FR64RegisterClass;
9895    else if (X86::VR128RegisterClass->hasType(VT))
9896      Res.second = X86::VR128RegisterClass;
9897  }
9898
9899  return Res;
9900}
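// Illustration (not part of the original source): a constraint like "{ax}"
// with an i32 operand initially resolves to AX in GR16; the fix-up logic
// above remaps it to EAX in GR32 so the operand does not get split into
// {ax},{dx}.  Likewise "{xmm0}" with an f64 operand is moved from VR128 into
// FR64.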
9901
9902//===----------------------------------------------------------------------===//
9903//                           X86 Widen vector type
9904//===----------------------------------------------------------------------===//
9905
9906/// getWidenVectorType - Given a vector type, return the type to widen it
9907/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
9908/// If there is no vector type that we want to widen to, returns MVT::Other.
9909/// When and where to widen is target-dependent, based on the cost of
9910/// scalarizing vs. using the wider vector type.
9911
9912EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
9913  assert(VT.isVector());
9914  if (isTypeLegal(VT))
9915    return VT;
9916
9917  // TODO: In computeRegisterProperty, we can compute the list of legal vector
9918  //       types based on element type.  This would speed up our search (though
9919  //       it may not be worth it since the size of the list is relatively
9920  //       small).
9921  EVT EltVT = VT.getVectorElementType();
9922  unsigned NElts = VT.getVectorNumElements();
9923
9924  // On X86, it makes sense to widen any vector wider than 1 element.
9925  if (NElts <= 1)
9926    return MVT::Other;
9927
9928  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
9929       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
9930    EVT SVT = (MVT::SimpleValueType)nVT;
9931
9932    if (isTypeLegal(SVT) &&
9933        SVT.getVectorElementType() == EltVT &&
9934        SVT.getVectorNumElements() > NElts)
9935      return SVT;
9936  }
9937  return MVT::Other;
9938}
9939