X86ISelLowering.cpp revision 37f32ee7ffe77d7c2bc1b185802e98979612f041
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "x86-isel"
16#include "X86.h"
17#include "X86InstrBuilder.h"
18#include "X86ISelLowering.h"
19#include "X86TargetMachine.h"
20#include "X86TargetObjectFile.h"
21#include "llvm/CallingConv.h"
22#include "llvm/Constants.h"
23#include "llvm/DerivedTypes.h"
24#include "llvm/GlobalAlias.h"
25#include "llvm/GlobalVariable.h"
26#include "llvm/Function.h"
27#include "llvm/Instructions.h"
28#include "llvm/Intrinsics.h"
29#include "llvm/LLVMContext.h"
30#include "llvm/CodeGen/MachineFrameInfo.h"
31#include "llvm/CodeGen/MachineFunction.h"
32#include "llvm/CodeGen/MachineInstrBuilder.h"
33#include "llvm/CodeGen/MachineJumpTableInfo.h"
34#include "llvm/CodeGen/MachineModuleInfo.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/PseudoSourceValue.h"
37#include "llvm/MC/MCAsmInfo.h"
38#include "llvm/MC/MCContext.h"
39#include "llvm/MC/MCExpr.h"
40#include "llvm/MC/MCSymbol.h"
41#include "llvm/ADT/BitVector.h"
42#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
44#include "llvm/ADT/StringExtras.h"
45#include "llvm/ADT/VectorExtras.h"
46#include "llvm/Support/CommandLine.h"
47#include "llvm/Support/Debug.h"
48#include "llvm/Support/Dwarf.h"
49#include "llvm/Support/ErrorHandling.h"
50#include "llvm/Support/MathExtras.h"
51#include "llvm/Support/raw_ostream.h"
52using namespace llvm;
53using namespace dwarf;
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57static cl::opt<bool>
58DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
59
60// Disable16Bit - 16-bit operations typically have a larger encoding than
61// corresponding 32-bit instructions, and 16-bit code is slow on some
62// processors. This is an experimental flag to disable 16-bit operations
63// (which forces them to be Legalized to 32-bit operations).
64static cl::opt<bool>
65Disable16Bit("disable-16bit", cl::Hidden,
66             cl::desc("Disable use of 16-bit instructions"));
67static cl::opt<bool>
68Promote16Bit("promote-16bit", cl::Hidden,
69             cl::desc("Promote 16-bit instructions"));
70
71// Forward declarations.
72static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
73                       SDValue V2);
74
75static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
76  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
77  default: llvm_unreachable("unknown subtarget type");
78  case X86Subtarget::isDarwin:
79    if (TM.getSubtarget<X86Subtarget>().is64Bit())
80      return new X8664_MachoTargetObjectFile();
81    return new TargetLoweringObjectFileMachO();
82  case X86Subtarget::isELF:
83    if (TM.getSubtarget<X86Subtarget>().is64Bit())
84      return new X8664_ELFTargetObjectFile(TM);
85    return new X8632_ELFTargetObjectFile(TM);
86  case X86Subtarget::isMingw:
87  case X86Subtarget::isCygwin:
88  case X86Subtarget::isWindows:
89    return new TargetLoweringObjectFileCOFF();
90  }
91}
92
93X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
94  : TargetLowering(TM, createTLOF(TM)) {
95  Subtarget = &TM.getSubtarget<X86Subtarget>();
96  X86ScalarSSEf64 = Subtarget->hasSSE2();
97  X86ScalarSSEf32 = Subtarget->hasSSE1();
98  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
99
100  RegInfo = TM.getRegisterInfo();
101  TD = getTargetData();
102
103  // Set up the TargetLowering object.
104
105  // X86 is weird: it always uses i8 for shift amounts and setcc results.
106  setShiftAmountType(MVT::i8);
107  setBooleanContents(ZeroOrOneBooleanContent);
108  setSchedulingPreference(SchedulingForRegPressure);
109  setStackPointerRegisterToSaveRestore(X86StackPtr);
110
111  if (Subtarget->isTargetDarwin()) {
112    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
113    setUseUnderscoreSetJmp(false);
114    setUseUnderscoreLongJmp(false);
115  } else if (Subtarget->isTargetMingw()) {
116    // MS runtime is weird: it exports _setjmp, but plain longjmp!
117    setUseUnderscoreSetJmp(true);
118    setUseUnderscoreLongJmp(false);
119  } else {
120    setUseUnderscoreSetJmp(true);
121    setUseUnderscoreLongJmp(true);
122  }
123
124  // Set up the register classes.
125  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
126  if (!Disable16Bit)
127    addRegisterClass(MVT::i16, X86::GR16RegisterClass);
128  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
129  if (Subtarget->is64Bit())
130    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
131
132  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
133
134  // We don't accept any truncstore of integer registers.
135  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
136  if (!Disable16Bit)
137    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
138  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
139  if (!Disable16Bit)
140    setTruncStoreAction(MVT::i32, MVT::i16, Expand);
141  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
142  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
143
144  // SETOEQ and SETUNE require checking two conditions.
145  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
146  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
147  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
148  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
149  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
150  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
151
152  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
153  // operation.
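  // (Note: a u8 or u16 operand is first zero-extended into the wider type;
  // the signed conversion then gives the exact result because the extended
  // value is always non-negative.)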
154  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
155  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
156  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
157
158  if (Subtarget->is64Bit()) {
159    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
160    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
161  } else if (!UseSoftFloat) {
162    if (X86ScalarSSEf64) {
163      // We have an impenetrably clever algorithm for ui64->double only.
164      setOperationAction(ISD::UINT_TO_FP   , MVT::i64  , Custom);
165    }
166    // We have an algorithm for SSE2, and we turn this into a 64-bit
167    // FILD for other targets.
168    setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
169  }
170
171  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
172  // this operation.
173  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
174  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
175
176  if (!UseSoftFloat) {
177    // SSE has no i16 to fp conversion, only i32
178    if (X86ScalarSSEf32) {
179      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
180      // f32 and f64 cases are Legal, f80 case is not
181      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
182    } else {
183      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
184      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
185    }
186  } else {
187    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
188    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
189  }
190
191  // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
192  // are Legal; f80 is custom lowered.
193  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
194  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
195
196  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
197  // this operation.
198  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
199  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
200
201  if (X86ScalarSSEf32) {
202    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
203    // f32 and f64 cases are Legal, f80 case is not
204    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
205  } else {
206    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
207    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
208  }
209
210  // Handle FP_TO_UINT by promoting the destination to a larger signed
211  // conversion.
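  // (Note: an f64->u16 conversion, for example, is performed as f64->i32 and
  // the result truncated; the wider signed range covers every value of the
  // narrower unsigned type.)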
212  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
213  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
214  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
215
216  if (Subtarget->is64Bit()) {
217    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
218    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
219  } else if (!UseSoftFloat) {
220    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
221      // Expand FP_TO_UINT into a select.
222      // FIXME: We would like to use a Custom expander here eventually to do
223      // the optimal thing for SSE vs. the default expansion in the legalizer.
224      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
225    else
226      // With SSE3 we can use fisttpll to convert to a signed i64; without
227      // SSE, we're stuck with a fistpll.
228      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
229  }
230
231  // TODO: when we have SSE, these could be more efficient by using movd/movq.
232  if (!X86ScalarSSEf64) {
233    setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
234    setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
235  }
236
237  // Scalar integer divide and remainder are lowered to use operations that
238  // produce two results, to match the available instructions. This exposes
239  // the two-result form to trivial CSE, which is able to combine x/y and x%y
240  // into a single instruction.
241  //
242  // Scalar integer multiply-high is also lowered to use two-result
243  // operations, to match the available instructions. However, plain multiply
244  // (low) operations are left as Legal, as there are single-result
245  // instructions for this in x86. Using the two-result multiply instructions
246  // when both high and low results are needed must be arranged by dagcombine.
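  // (Note: with SDIV/SREM marked Expand, code computing both x/y and x%y ends
  // up sharing a single ISD::SDIVREM node after CSE, matching the one IDIV
  // instruction that produces quotient and remainder together.)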
247  setOperationAction(ISD::MULHS           , MVT::i8    , Expand);
248  setOperationAction(ISD::MULHU           , MVT::i8    , Expand);
249  setOperationAction(ISD::SDIV            , MVT::i8    , Expand);
250  setOperationAction(ISD::UDIV            , MVT::i8    , Expand);
251  setOperationAction(ISD::SREM            , MVT::i8    , Expand);
252  setOperationAction(ISD::UREM            , MVT::i8    , Expand);
253  setOperationAction(ISD::MULHS           , MVT::i16   , Expand);
254  setOperationAction(ISD::MULHU           , MVT::i16   , Expand);
255  setOperationAction(ISD::SDIV            , MVT::i16   , Expand);
256  setOperationAction(ISD::UDIV            , MVT::i16   , Expand);
257  setOperationAction(ISD::SREM            , MVT::i16   , Expand);
258  setOperationAction(ISD::UREM            , MVT::i16   , Expand);
259  setOperationAction(ISD::MULHS           , MVT::i32   , Expand);
260  setOperationAction(ISD::MULHU           , MVT::i32   , Expand);
261  setOperationAction(ISD::SDIV            , MVT::i32   , Expand);
262  setOperationAction(ISD::UDIV            , MVT::i32   , Expand);
263  setOperationAction(ISD::SREM            , MVT::i32   , Expand);
264  setOperationAction(ISD::UREM            , MVT::i32   , Expand);
265  setOperationAction(ISD::MULHS           , MVT::i64   , Expand);
266  setOperationAction(ISD::MULHU           , MVT::i64   , Expand);
267  setOperationAction(ISD::SDIV            , MVT::i64   , Expand);
268  setOperationAction(ISD::UDIV            , MVT::i64   , Expand);
269  setOperationAction(ISD::SREM            , MVT::i64   , Expand);
270  setOperationAction(ISD::UREM            , MVT::i64   , Expand);
271
272  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
273  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
274  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
275  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
276  if (Subtarget->is64Bit())
277    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
278  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
279  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
280  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
281  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
282  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
283  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
284  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
285  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
286
287  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
288  setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
289  setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
290  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
291  if (Disable16Bit) {
292    setOperationAction(ISD::CTTZ           , MVT::i16  , Expand);
293    setOperationAction(ISD::CTLZ           , MVT::i16  , Expand);
294  } else {
295    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
296    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
297  }
298  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
299  setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
300  setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
301  if (Subtarget->is64Bit()) {
302    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
303    setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
304    setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
305  }
306
307  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
308  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
309
310  // These should be promoted to a larger select which is supported.
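  // (An i1 select is promoted to the i8 form, which is custom lowered below
  // as an X86 conditional-move sequence.)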
311  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
312  // X86 wants to expand cmov itself.
313  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
314  if (Disable16Bit)
315    setOperationAction(ISD::SELECT        , MVT::i16  , Expand);
316  else
317    setOperationAction(ISD::SELECT        , MVT::i16  , Custom);
318  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
319  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
320  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
321  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
322  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
323  if (Disable16Bit)
324    setOperationAction(ISD::SETCC         , MVT::i16  , Expand);
325  else
326    setOperationAction(ISD::SETCC         , MVT::i16  , Custom);
327  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
328  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
329  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
330  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
331  if (Subtarget->is64Bit()) {
332    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
333    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
334  }
335  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
336
337  // Darwin ABI issue.
338  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
339  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
340  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
341  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
342  if (Subtarget->is64Bit())
343    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
344  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
345  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
346  if (Subtarget->is64Bit()) {
347    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
348    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
349    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
350    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
351    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
352  }
353  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
354  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
355  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
356  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
357  if (Subtarget->is64Bit()) {
358    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
359    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
360    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
361  }
362
363  if (Subtarget->hasSSE1())
364    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
365
366  if (!Subtarget->hasSSE2())
367    setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);
368
369  // Expand certain atomics
370  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
371  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
372  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
373  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
374
375  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
376  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
377  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
378  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
379
380  if (!Subtarget->is64Bit()) {
381    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
382    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
383    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
384    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
385    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
386    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
387    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
388  }
389
390  // FIXME - use subtarget debug flags
391  if (!Subtarget->isTargetDarwin() &&
392      !Subtarget->isTargetELF() &&
393      !Subtarget->isTargetCygMing()) {
394    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
395  }
396
397  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
398  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
399  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
400  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
401  if (Subtarget->is64Bit()) {
402    setExceptionPointerRegister(X86::RAX);
403    setExceptionSelectorRegister(X86::RDX);
404  } else {
405    setExceptionPointerRegister(X86::EAX);
406    setExceptionSelectorRegister(X86::EDX);
407  }
408  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
409  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
410
411  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
412
413  setOperationAction(ISD::TRAP, MVT::Other, Legal);
414
415  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
416  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
417  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
418  if (Subtarget->is64Bit()) {
419    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
420    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
421  } else {
422    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
423    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
424  }
425
426  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
427  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
428  if (Subtarget->is64Bit())
429    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
430  if (Subtarget->isTargetCygMing())
431    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
432  else
433    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
434
435  if (!UseSoftFloat && X86ScalarSSEf64) {
436    // f32 and f64 use SSE.
437    // Set up the FP register classes.
438    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
439    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
440
441    // Use ANDPD to simulate FABS.
442    setOperationAction(ISD::FABS , MVT::f64, Custom);
443    setOperationAction(ISD::FABS , MVT::f32, Custom);
444
445    // Use XORP to simulate FNEG.
446    setOperationAction(ISD::FNEG , MVT::f64, Custom);
447    setOperationAction(ISD::FNEG , MVT::f32, Custom);
448
449    // Use ANDPD and ORPD to simulate FCOPYSIGN.
450    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
451    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
452
453    // We don't support sin/cos/fmod
454    setOperationAction(ISD::FSIN , MVT::f64, Expand);
455    setOperationAction(ISD::FCOS , MVT::f64, Expand);
456    setOperationAction(ISD::FSIN , MVT::f32, Expand);
457    setOperationAction(ISD::FCOS , MVT::f32, Expand);
458
459    // Expand FP immediates into loads from the stack, except for the special
460    // cases we handle.
461    addLegalFPImmediate(APFloat(+0.0)); // xorpd
462    addLegalFPImmediate(APFloat(+0.0f)); // xorps
463  } else if (!UseSoftFloat && X86ScalarSSEf32) {
464    // Use SSE for f32, x87 for f64.
465    // Set up the FP register classes.
466    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
467    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
468
469    // Use ANDPS to simulate FABS.
470    setOperationAction(ISD::FABS , MVT::f32, Custom);
471
472    // Use XORP to simulate FNEG.
473    setOperationAction(ISD::FNEG , MVT::f32, Custom);
474
475    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
476
477    // Use ANDPS and ORPS to simulate FCOPYSIGN.
478    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
479    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
480
481    // We don't support sin/cos/fmod
482    setOperationAction(ISD::FSIN , MVT::f32, Expand);
483    setOperationAction(ISD::FCOS , MVT::f32, Expand);
484
485    // Special cases we handle for FP constants.
486    addLegalFPImmediate(APFloat(+0.0f)); // xorps
487    addLegalFPImmediate(APFloat(+0.0)); // FLD0
488    addLegalFPImmediate(APFloat(+1.0)); // FLD1
489    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
490    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
491
492    if (!UnsafeFPMath) {
493      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
494      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
495    }
496  } else if (!UseSoftFloat) {
497    // f32 and f64 in x87.
498    // Set up the FP register classes.
499    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
500    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
501
502    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
503    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
504    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
505    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
506
507    if (!UnsafeFPMath) {
508      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
509      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
510    }
511    addLegalFPImmediate(APFloat(+0.0)); // FLD0
512    addLegalFPImmediate(APFloat(+1.0)); // FLD1
513    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
514    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
515    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
516    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
517    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
518    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
519  }
520
521  // Long double always uses X87.
522  if (!UseSoftFloat) {
523    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
524    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
525    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
526    {
527      bool ignored;
528      APFloat TmpFlt(+0.0);
529      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
530                     &ignored);
531      addLegalFPImmediate(TmpFlt);  // FLD0
532      TmpFlt.changeSign();
533      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
534      APFloat TmpFlt2(+1.0);
535      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
536                      &ignored);
537      addLegalFPImmediate(TmpFlt2);  // FLD1
538      TmpFlt2.changeSign();
539      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
540    }
541
542    if (!UnsafeFPMath) {
543      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
544      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
545    }
546  }
547
548  // Always use a library call for pow.
549  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
550  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
551  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
552
553  setOperationAction(ISD::FLOG, MVT::f80, Expand);
554  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
555  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
556  setOperationAction(ISD::FEXP, MVT::f80, Expand);
557  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
558
559  // First set operation action for all vector types to either promote
560  // (for widening) or expand (for scalarization). Then we will selectively
561  // turn on ones that can be effectively codegen'd.
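  // (The per-type settings in the MMX/SSE/AVX blocks below override these
  // defaults for the vector types the subtarget actually supports.)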
562  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
563       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
564    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
565    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
566    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
567    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
568    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
569    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
570    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
571    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
572    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
573    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
574    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
575    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
576    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
577    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
578    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
579    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
580    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
581    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
582    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
583    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
584    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
585    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
586    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
587    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
588    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
589    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
590    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
591    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
592    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
593    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
594    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
595    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
596    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
597    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
598    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
599    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
600    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
601    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
602    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
603    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
604    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
605    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
606    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
607    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
608    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
609    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
610    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
611    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
612    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
613    setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
614    setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
615    setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
616    setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
617    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
618         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
619      setTruncStoreAction((MVT::SimpleValueType)VT,
620                          (MVT::SimpleValueType)InnerVT, Expand);
621    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
622    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
623    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
624  }
625
626  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
627  // with -msoft-float, disable use of MMX as well.
628  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
629    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
630    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
631    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
632    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
633    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
634
635    setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
636    setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
637    setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
638    setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
639
640    setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
641    setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
642    setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
643    setOperationAction(ISD::SUB,                MVT::v1i64, Legal);
644
645    setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
646    setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
647
648    setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
649    AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
650    setOperationAction(ISD::AND,                MVT::v4i16, Promote);
651    AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
652    setOperationAction(ISD::AND,                MVT::v2i32, Promote);
653    AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
654    setOperationAction(ISD::AND,                MVT::v1i64, Legal);
655
656    setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
657    AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
658    setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
659    AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
660    setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
661    AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
662    setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
663
664    setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
665    AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
666    setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
667    AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
668    setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
669    AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
670    setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
671
672    setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
673    AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
674    setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
675    AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
676    setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
677    AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
678    setOperationAction(ISD::LOAD,               MVT::v2f32, Promote);
679    AddPromotedToType (ISD::LOAD,               MVT::v2f32, MVT::v1i64);
680    setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
681
682    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
683    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
684    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
685    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f32, Custom);
686    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
687
688    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
689    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
690    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
691    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
692
693    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2f32, Custom);
694    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
695    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
696    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
697
698    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
699
700    setOperationAction(ISD::SELECT,             MVT::v8i8, Promote);
701    setOperationAction(ISD::SELECT,             MVT::v4i16, Promote);
702    setOperationAction(ISD::SELECT,             MVT::v2i32, Promote);
703    setOperationAction(ISD::SELECT,             MVT::v1i64, Custom);
704    setOperationAction(ISD::VSETCC,             MVT::v8i8, Custom);
705    setOperationAction(ISD::VSETCC,             MVT::v4i16, Custom);
706    setOperationAction(ISD::VSETCC,             MVT::v2i32, Custom);
707  }
708
709  if (!UseSoftFloat && Subtarget->hasSSE1()) {
710    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
711
712    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
713    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
714    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
715    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
716    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
717    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
718    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
719    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
720    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
721    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
722    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
723    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
724  }
725
726  if (!UseSoftFloat && Subtarget->hasSSE2()) {
727    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
728
729    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
730    // registers cannot be used even for integer operations.
731    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
732    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
733    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
734    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
735
736    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
737    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
738    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
739    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
740    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
741    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
742    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
743    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
744    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
745    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
746    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
747    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
748    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
749    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
750    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
751    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
752
753    setOperationAction(ISD::VSETCC,             MVT::v2f64, Custom);
754    setOperationAction(ISD::VSETCC,             MVT::v16i8, Custom);
755    setOperationAction(ISD::VSETCC,             MVT::v8i16, Custom);
756    setOperationAction(ISD::VSETCC,             MVT::v4i32, Custom);
757
758    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
759    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
760    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
761    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
762    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
763
764    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
765    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
766    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
767    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
768    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
769
770    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
771    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
772      EVT VT = (MVT::SimpleValueType)i;
773      // Do not attempt to custom lower non-power-of-2 vectors
774      if (!isPowerOf2_32(VT.getVectorNumElements()))
775        continue;
776      // Do not attempt to custom lower non-128-bit vectors
777      if (!VT.is128BitVector())
778        continue;
779      setOperationAction(ISD::BUILD_VECTOR,
780                         VT.getSimpleVT().SimpleTy, Custom);
781      setOperationAction(ISD::VECTOR_SHUFFLE,
782                         VT.getSimpleVT().SimpleTy, Custom);
783      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
784                         VT.getSimpleVT().SimpleTy, Custom);
785    }
786
787    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
788    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
789    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
790    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
791    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
792    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
793
794    if (Subtarget->is64Bit()) {
795      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
796      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
797    }
798
799    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
800    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
801      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
802      EVT VT = SVT;
803
804      // Do not attempt to promote non-128-bit vectors
805      if (!VT.is128BitVector()) {
806        continue;
807      }
808
809      setOperationAction(ISD::AND,    SVT, Promote);
810      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
811      setOperationAction(ISD::OR,     SVT, Promote);
812      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
813      setOperationAction(ISD::XOR,    SVT, Promote);
814      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
815      setOperationAction(ISD::LOAD,   SVT, Promote);
816      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
817      setOperationAction(ISD::SELECT, SVT, Promote);
818      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
819    }
820
821    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
822
823    // Custom lower v2i64 and v2f64 selects.
824    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
825    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
826    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
827    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
828
829    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
830    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
831    if (!DisableMMX && Subtarget->hasMMX()) {
832      setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
833      setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
834    }
835  }
836
837  if (Subtarget->hasSSE41()) {
838    // FIXME: Do we need to handle scalar-to-vector here?
839    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
840
841    // i8 and i16 vectors are custom, because the source register and
842    // source memory operand types are not the same width.  f32 vectors are
843    // custom since the immediate controlling the insert encodes additional
844    // information.
845    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
846    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
847    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
848    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
849
850    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
851    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
852    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
853    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
854
855    if (Subtarget->is64Bit()) {
856      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
857      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
858    }
859  }
860
861  if (Subtarget->hasSSE42()) {
862    setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
863  }
864
865  if (!UseSoftFloat && Subtarget->hasAVX()) {
866    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
867    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
868    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
869    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
870
871    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
872    setOperationAction(ISD::LOAD,               MVT::v8i32, Legal);
873    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
874    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
875    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
876    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
877    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
878    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
879    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
880    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
881    //setOperationAction(ISD::BUILD_VECTOR,       MVT::v8f32, Custom);
882    //setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8f32, Custom);
883    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
884    //setOperationAction(ISD::SELECT,             MVT::v8f32, Custom);
885    //setOperationAction(ISD::VSETCC,             MVT::v8f32, Custom);
886
887    // Operations to consider (commented out below): v16i16, v32i8
888    //setOperationAction(ISD::ADD,                MVT::v16i16, Legal);
889    setOperationAction(ISD::ADD,                MVT::v8i32, Custom);
890    setOperationAction(ISD::ADD,                MVT::v4i64, Custom);
891    //setOperationAction(ISD::SUB,                MVT::v32i8, Legal);
892    //setOperationAction(ISD::SUB,                MVT::v16i16, Legal);
893    setOperationAction(ISD::SUB,                MVT::v8i32, Custom);
894    setOperationAction(ISD::SUB,                MVT::v4i64, Custom);
895    //setOperationAction(ISD::MUL,                MVT::v16i16, Legal);
896    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
897    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
898    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
899    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
900    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
901    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
902
903    setOperationAction(ISD::VSETCC,             MVT::v4f64, Custom);
904    // setOperationAction(ISD::VSETCC,             MVT::v32i8, Custom);
905    // setOperationAction(ISD::VSETCC,             MVT::v16i16, Custom);
906    setOperationAction(ISD::VSETCC,             MVT::v8i32, Custom);
907
908    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i8, Custom);
909    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i16, Custom);
910    // setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i16, Custom);
911    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i32, Custom);
912    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8f32, Custom);
913
914    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f64, Custom);
915    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i64, Custom);
916    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f64, Custom);
917    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i64, Custom);
918    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f64, Custom);
919    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);
920
921#if 0
922    // Not sure we want to do this since there are no 256-bit integer
923    // operations in AVX
924
925    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
926    // This includes 256-bit vectors
927    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
928      EVT VT = (MVT::SimpleValueType)i;
929
930      // Do not attempt to custom lower non-power-of-2 vectors
931      if (!isPowerOf2_32(VT.getVectorNumElements()))
932        continue;
933
934      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
935      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
936      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
937    }
938
939    if (Subtarget->is64Bit()) {
940      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i64, Custom);
941      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
942    }
943#endif
944
945#if 0
946    // Not sure we want to do this since there are no 256-bit integer
947    // operations in AVX
948
949    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
950    // Including 256-bit vectors
951    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
952      EVT VT = (MVT::SimpleValueType)i;
953
954      if (!VT.is256BitVector()) {
955        continue;
956      }
957      setOperationAction(ISD::AND,    VT, Promote);
958      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
959      setOperationAction(ISD::OR,     VT, Promote);
960      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
961      setOperationAction(ISD::XOR,    VT, Promote);
962      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
963      setOperationAction(ISD::LOAD,   VT, Promote);
964      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
965      setOperationAction(ISD::SELECT, VT, Promote);
966      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
967    }
968
969    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
970#endif
971  }
972
973  // We want to custom lower some of our intrinsics.
974  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
975
976  // Add/Sub/Mul with overflow operations are custom lowered.
977  setOperationAction(ISD::SADDO, MVT::i32, Custom);
978  setOperationAction(ISD::SADDO, MVT::i64, Custom);
979  setOperationAction(ISD::UADDO, MVT::i32, Custom);
980  setOperationAction(ISD::UADDO, MVT::i64, Custom);
981  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
982  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
983  setOperationAction(ISD::USUBO, MVT::i32, Custom);
984  setOperationAction(ISD::USUBO, MVT::i64, Custom);
985  setOperationAction(ISD::SMULO, MVT::i32, Custom);
986  setOperationAction(ISD::SMULO, MVT::i64, Custom);
987
988  if (!Subtarget->is64Bit()) {
989    // These libcalls are not available in 32-bit.
990    setLibcallName(RTLIB::SHL_I128, 0);
991    setLibcallName(RTLIB::SRL_I128, 0);
992    setLibcallName(RTLIB::SRA_I128, 0);
993  }
994
995  // We have target-specific dag combine patterns for the following nodes:
996  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
997  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
998  setTargetDAGCombine(ISD::BUILD_VECTOR);
999  setTargetDAGCombine(ISD::SELECT);
1000  setTargetDAGCombine(ISD::SHL);
1001  setTargetDAGCombine(ISD::SRA);
1002  setTargetDAGCombine(ISD::SRL);
1003  setTargetDAGCombine(ISD::OR);
1004  setTargetDAGCombine(ISD::STORE);
1005  setTargetDAGCombine(ISD::MEMBARRIER);
1006  setTargetDAGCombine(ISD::ZERO_EXTEND);
1007  if (Subtarget->is64Bit())
1008    setTargetDAGCombine(ISD::MUL);
1009
1010  computeRegisterProperties();
1011
1012  // FIXME: These should be based on subtarget info. Plus, the values should
1013  // be smaller when we are optimizing for size.
1014  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1015  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1016  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
1017  setPrefLoopAlignment(16);
1018  benefitFromCodePlacementOpt = true;
1019}
1020
1021
1022MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
1023  return MVT::i8;
1024}
1025
1026
1027/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1028/// the desired ByVal argument alignment.
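/// For example, an aggregate containing a 128-bit vector member at any
/// nesting depth reports a 16-byte alignment; other element types leave
/// MaxAlign unchanged.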
1029static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
1030  if (MaxAlign == 16)
1031    return;
1032  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1033    if (VTy->getBitWidth() == 128)
1034      MaxAlign = 16;
1035  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1036    unsigned EltAlign = 0;
1037    getMaxByValAlign(ATy->getElementType(), EltAlign);
1038    if (EltAlign > MaxAlign)
1039      MaxAlign = EltAlign;
1040  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
1041    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1042      unsigned EltAlign = 0;
1043      getMaxByValAlign(STy->getElementType(i), EltAlign);
1044      if (EltAlign > MaxAlign)
1045        MaxAlign = EltAlign;
1046      if (MaxAlign == 16)
1047        break;
1048    }
1049  }
1050  return;
1051}
1052
1053/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1054/// function arguments in the caller parameter area. For X86, aggregates
1055/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1056/// are at 4-byte boundaries.
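/// (In 64-bit mode the minimum is 8 bytes, as implemented below.)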
1057unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
1058  if (Subtarget->is64Bit()) {
1059    // Max of 8 and alignment of type.
1060    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1061    if (TyAlign > 8)
1062      return TyAlign;
1063    return 8;
1064  }
1065
1066  unsigned Align = 4;
1067  if (Subtarget->hasSSE1())
1068    getMaxByValAlign(Ty, Align);
1069  return Align;
1070}
1071
1072/// getOptimalMemOpType - Returns the target specific optimal type for load
1073/// and store operations as a result of memset, memcpy, and memmove
1074/// lowering. If DstAlign is zero, the destination alignment can satisfy any
1075/// constraint. Similarly, if SrcAlign is zero there is no need to check it
1076/// against an alignment requirement, probably because the source does not
1077/// need to be loaded. If
1078/// 'NonScalarIntSafe' is true, that means it's safe to return a
1079/// non-scalar-integer type, e.g. empty string source, constant, or loaded
1080/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
1081/// constant so it does not need to be loaded.
1082/// It returns EVT::Other if the type should be determined using generic
1083/// target-independent logic.
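/// For example, a 16-byte or larger request on an SSE2 subtarget with
/// sufficiently aligned operands returns MVT::v4i32, so the expansion can use
/// 128-bit stores.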
1084EVT
1085X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1086                                       unsigned DstAlign, unsigned SrcAlign,
1087                                       bool NonScalarIntSafe,
1088                                       bool MemcpyStrSrc,
1089                                       MachineFunction &MF) const {
1090  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
1091  // linux.  This is because the stack realignment code can't handle certain
1092  // cases like PR2962.  This should be removed when PR2962 is fixed.
1093  const Function *F = MF.getFunction();
1094  if (NonScalarIntSafe &&
1095      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
1096    if (Size >= 16 &&
1097        (Subtarget->isUnalignedMemAccessFast() ||
1098         ((DstAlign == 0 || DstAlign >= 16) &&
1099          (SrcAlign == 0 || SrcAlign >= 16))) &&
1100        Subtarget->getStackAlignment() >= 16) {
1101      if (Subtarget->hasSSE2())
1102        return MVT::v4i32;
1103      if (Subtarget->hasSSE1())
1104        return MVT::v4f32;
1105    } else if (!MemcpyStrSrc && Size >= 8 &&
1106               !Subtarget->is64Bit() &&
1107               Subtarget->getStackAlignment() >= 8 &&
1108               Subtarget->hasSSE2()) {
1109      // Do not use f64 to lower memcpy if the source is a string constant.
1110      // It's better to use i32 to avoid the loads.
1111      return MVT::f64;
1112    }
1113  }
1114  if (Subtarget->is64Bit() && Size >= 8)
1115    return MVT::i64;
1116  return MVT::i32;
1117}
1118
1119/// getJumpTableEncoding - Return the entry encoding for a jump table in the
1120/// current function.  The returned value is a member of the
1121/// MachineJumpTableInfo::JTEntryKind enum.
1122unsigned X86TargetLowering::getJumpTableEncoding() const {
1123  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1124  // symbol.
1125  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1126      Subtarget->isPICStyleGOT())
1127    return MachineJumpTableInfo::EK_Custom32;
1128
1129  // Otherwise, use the normal jump table encoding heuristics.
1130  return TargetLowering::getJumpTableEncoding();
1131}
1132
1133/// getPICBaseSymbol - Return the X86-32 PIC base.
1134MCSymbol *
1135X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
1136                                    MCContext &Ctx) const {
1137  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
1138  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
1139                               Twine(MF->getFunctionNumber())+"$pb");
1140}
1141
1142
1143const MCExpr *
1144X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1145                                             const MachineBasicBlock *MBB,
1146                                             unsigned uid,MCContext &Ctx) const{
1147  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1148         Subtarget->isPICStyleGOT());
1149  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1150  // entries.
1151  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1152                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1153}
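// Illustrative example (assumed output, not from the original source): for a
// jump table entry targeting basic block .LBB7_2 on a 32-bit ELF PIC target,
// the expression built above would be emitted roughly as
//   .long .LBB7_2@GOTOFF
// in the resulting assembly.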
1154
1155/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
1156/// jumptable.
1157SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1158                                                    SelectionDAG &DAG) const {
1159  if (!Subtarget->is64Bit())
1160    // This doesn't have DebugLoc associated with it, but is not really the
1161    // same as a Register.
1162    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
1163  return Table;
1164}
1165
1166/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1167/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1168/// MCExpr.
1169const MCExpr *X86TargetLowering::
1170getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1171                             MCContext &Ctx) const {
1172  // X86-64 uses RIP relative addressing based on the jump table label.
1173  if (Subtarget->isPICStyleRIPRel())
1174    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1175
1176  // Otherwise, the reference is relative to the PIC base.
1177  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
1178}
1179
1180/// getFunctionAlignment - Return the Log2 alignment of this function.
1181unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
1182  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
1183}
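// Illustrative note (not from the original source): the value returned above is
// a log2 alignment, so 4 means functions get aligned to 1 << 4 = 16 bytes, while
// functions marked OptimizeForSize get 1 << 0 = 1 byte, i.e. no extra alignment.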
1184
1185//===----------------------------------------------------------------------===//
1186//               Return Value Calling Convention Implementation
1187//===----------------------------------------------------------------------===//
1188
1189#include "X86GenCallingConv.inc"
1190
1191bool
1192X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
1193                        const SmallVectorImpl<EVT> &OutTys,
1194                        const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
1195                        SelectionDAG &DAG) {
1196  SmallVector<CCValAssign, 16> RVLocs;
1197  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1198                 RVLocs, *DAG.getContext());
1199  return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
1200}
1201
1202SDValue
1203X86TargetLowering::LowerReturn(SDValue Chain,
1204                               CallingConv::ID CallConv, bool isVarArg,
1205                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1206                               DebugLoc dl, SelectionDAG &DAG) {
1207
1208  SmallVector<CCValAssign, 16> RVLocs;
1209  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1210                 RVLocs, *DAG.getContext());
1211  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1212
1213  // Add the regs to the liveout set for the function.
1214  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1215  for (unsigned i = 0; i != RVLocs.size(); ++i)
1216    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
1217      MRI.addLiveOut(RVLocs[i].getLocReg());
1218
1219  SDValue Flag;
1220
1221  SmallVector<SDValue, 6> RetOps;
1222  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1223  // Operand #1 = Bytes To Pop
1224  RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16));
1225
1226  // Copy the result values into the output registers.
1227  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1228    CCValAssign &VA = RVLocs[i];
1229    assert(VA.isRegLoc() && "Can only return in registers!");
1230    SDValue ValToCopy = Outs[i].Val;
1231
1232    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1233    // the RET instruction and handled by the FP Stackifier.
1234    if (VA.getLocReg() == X86::ST0 ||
1235        VA.getLocReg() == X86::ST1) {
1236      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1237      // change the value to the FP stack register class.
1238      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1239        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1240      RetOps.push_back(ValToCopy);
1241      // Don't emit a copytoreg.
1242      continue;
1243    }
1244
1245    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1246    // which is returned in RAX / RDX.
1247    if (Subtarget->is64Bit()) {
1248      EVT ValVT = ValToCopy.getValueType();
1249      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
1250        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
1251        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
1252          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
1253      }
1254    }
1255
1256    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1257    Flag = Chain.getValue(1);
1258  }
1259
1260  // The x86-64 ABI for returning structs by value requires that we copy
1261  // the sret argument into %rax for the return. We saved the argument into
1262  // a virtual register in the entry block, so now we copy the value out
1263  // and into %rax.
1264  if (Subtarget->is64Bit() &&
1265      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1266    MachineFunction &MF = DAG.getMachineFunction();
1267    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1268    unsigned Reg = FuncInfo->getSRetReturnReg();
1269    if (!Reg) {
1270      Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64));
1271      FuncInfo->setSRetReturnReg(Reg);
1272    }
1273    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1274
1275    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1276    Flag = Chain.getValue(1);
1277
1278    // RAX now acts like a return value.
1279    MRI.addLiveOut(X86::RAX);
1280  }
1281
1282  RetOps[0] = Chain;  // Update chain.
1283
1284  // Add the flag if we have it.
1285  if (Flag.getNode())
1286    RetOps.push_back(Flag);
1287
1288  return DAG.getNode(X86ISD::RET_FLAG, dl,
1289                     MVT::Other, &RetOps[0], RetOps.size());
1290}
1291
1292/// LowerCallResult - Lower the result values of a call into the
1293/// appropriate copies out of appropriate physical registers.
1294///
1295SDValue
1296X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1297                                   CallingConv::ID CallConv, bool isVarArg,
1298                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1299                                   DebugLoc dl, SelectionDAG &DAG,
1300                                   SmallVectorImpl<SDValue> &InVals) {
1301
1302  // Assign locations to each value returned by this call.
1303  SmallVector<CCValAssign, 16> RVLocs;
1304  bool Is64Bit = Subtarget->is64Bit();
1305  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1306                 RVLocs, *DAG.getContext());
1307  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1308
1309  // Copy all of the result registers out of their specified physreg.
1310  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1311    CCValAssign &VA = RVLocs[i];
1312    EVT CopyVT = VA.getValVT();
1313
1314    // If this is x86-64, and we disabled SSE, we can't return FP values
1315    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1316        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1317      report_fatal_error("SSE register return with SSE disabled");
1318    }
1319
1320    // If this is a call to a function that returns an fp value on the floating
1321    // point stack, but where we prefer to use the value in xmm registers, copy
1322    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
1323    if ((VA.getLocReg() == X86::ST0 ||
1324         VA.getLocReg() == X86::ST1) &&
1325        isScalarFPTypeInSSEReg(VA.getValVT())) {
1326      CopyVT = MVT::f80;
1327    }
1328
1329    SDValue Val;
1330    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
1331      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
1332      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1333        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1334                                   MVT::v2i64, InFlag).getValue(1);
1335        Val = Chain.getValue(0);
1336        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1337                          Val, DAG.getConstant(0, MVT::i64));
1338      } else {
1339        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1340                                   MVT::i64, InFlag).getValue(1);
1341        Val = Chain.getValue(0);
1342      }
1343      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
1344    } else {
1345      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1346                                 CopyVT, InFlag).getValue(1);
1347      Val = Chain.getValue(0);
1348    }
1349    InFlag = Chain.getValue(2);
1350
1351    if (CopyVT != VA.getValVT()) {
1352      // Round the f80 to the right size, which also moves it to the appropriate xmm
1353      // register.
1354      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1355                        // This truncation won't change the value.
1356                        DAG.getIntPtrConstant(1));
1357    }
1358
1359    InVals.push_back(Val);
1360  }
1361
1362  return Chain;
1363}
1364
1365
1366//===----------------------------------------------------------------------===//
1367//                C & StdCall & Fast Calling Convention implementation
1368//===----------------------------------------------------------------------===//
1369//  The StdCall calling convention is standard for many Windows API
1370//  routines. It differs from the C calling convention only a little: the
1371//  callee cleans up the stack, not the caller. Symbols are also
1372//  decorated in some fancy way :) It doesn't support any vector arguments.
1373//  For info on fast calling convention see Fast Calling Convention (tail call)
1374//  implementation LowerX86_32FastCCCallTo.
1375
1376/// CallIsStructReturn - Determines whether a call uses struct return
1377/// semantics.
1378static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1379  if (Outs.empty())
1380    return false;
1381
1382  return Outs[0].Flags.isSRet();
1383}
1384
1385/// ArgsAreStructReturn - Determines whether a function uses struct
1386/// return semantics.
1387static bool
1388ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1389  if (Ins.empty())
1390    return false;
1391
1392  return Ins[0].Flags.isSRet();
1393}
1394
1395/// IsCalleePop - Determines whether the callee is required to pop its
1396/// own arguments. Callee pop is necessary to support tail calls.
1397bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
1398  if (IsVarArg)
1399    return false;
1400
1401  switch (CallingConv) {
1402  default:
1403    return false;
1404  case CallingConv::X86_StdCall:
1405    return !Subtarget->is64Bit();
1406  case CallingConv::X86_FastCall:
1407    return !Subtarget->is64Bit();
1408  case CallingConv::Fast:
1409    return GuaranteedTailCallOpt;
1410  case CallingConv::GHC:
1411    return GuaranteedTailCallOpt;
1412  }
1413}
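// Illustrative example (not from the original source): on x86-32, a function
// such as
//   define x86_stdcallcc void @f(i32 %a, i32 %b)
// is callee-pop, so its return is emitted as "ret $8" and the caller does not
// adjust the stack afterwards; the same function under the C calling convention
// would return with a plain "ret" and leave the 8 argument bytes for the caller.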
1414
1415/// CCAssignFnForNode - Selects the correct CCAssignFn for the
1416/// given CallingConvention value.
1417CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1418  if (Subtarget->is64Bit()) {
1419    if (CC == CallingConv::GHC)
1420      return CC_X86_64_GHC;
1421    else if (Subtarget->isTargetWin64())
1422      return CC_X86_Win64_C;
1423    else
1424      return CC_X86_64_C;
1425  }
1426
1427  if (CC == CallingConv::X86_FastCall)
1428    return CC_X86_32_FastCall;
1429  else if (CC == CallingConv::Fast)
1430    return CC_X86_32_FastCC;
1431  else if (CC == CallingConv::GHC)
1432    return CC_X86_32_GHC;
1433  else
1434    return CC_X86_32_C;
1435}
1436
1437/// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
1438/// specified by "Src" to the address "Dst", with size and alignment given by
1439/// the specific parameter attribute. The copy will be passed as a byval
1440/// function parameter.
1441static SDValue
1442CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1443                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1444                          DebugLoc dl) {
1445  SDValue SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1446  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1447                       /*isVolatile*/false, /*AlwaysInline=*/true,
1448                       NULL, 0, NULL, 0);
1449}
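// Illustrative example (not from the original source): for a byval argument of a
// 12-byte struct with 4-byte alignment, the helper above emits the equivalent of
// a non-volatile memcpy of 12 bytes at alignment 4 from the caller's copy to the
// outgoing argument area, marked AlwaysInline so it is never turned into a
// library call.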
1450
1451/// IsTailCallConvention - Return true if the calling convention is one that
1452/// supports tail call optimization.
1453static bool IsTailCallConvention(CallingConv::ID CC) {
1454  return (CC == CallingConv::Fast || CC == CallingConv::GHC);
1455}
1456
1457/// FuncIsMadeTailCallSafe - Return true if the function is being made into
1458/// a tailcall target by changing its ABI.
1459static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
1460  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1461}
1462
1463SDValue
1464X86TargetLowering::LowerMemArgument(SDValue Chain,
1465                                    CallingConv::ID CallConv,
1466                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1467                                    DebugLoc dl, SelectionDAG &DAG,
1468                                    const CCValAssign &VA,
1469                                    MachineFrameInfo *MFI,
1470                                    unsigned i) {
1471  // Create the nodes corresponding to a load from this parameter slot.
1472  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1473  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
1474  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1475  EVT ValVT;
1476
1477  // If value is passed by pointer we have address passed instead of the value
1478  // itself.
1479  if (VA.getLocInfo() == CCValAssign::Indirect)
1480    ValVT = VA.getLocVT();
1481  else
1482    ValVT = VA.getValVT();
1483
1484  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1485  // changed with more analysis.
1486  // In case of tail call optimization, mark all arguments mutable, since they
1487  // could be overwritten when the arguments of a tail call are lowered.
1488  if (Flags.isByVal()) {
1489    int FI = MFI->CreateFixedObject(Flags.getByValSize(),
1490                                    VA.getLocMemOffset(), isImmutable, false);
1491    return DAG.getFrameIndex(FI, getPointerTy());
1492  } else {
1493    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1494                                    VA.getLocMemOffset(), isImmutable, false);
1495    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1496    return DAG.getLoad(ValVT, dl, Chain, FIN,
1497                       PseudoSourceValue::getFixedStack(FI), 0,
1498                       false, false, 0);
1499  }
1500}
1501
1502SDValue
1503X86TargetLowering::LowerFormalArguments(SDValue Chain,
1504                                        CallingConv::ID CallConv,
1505                                        bool isVarArg,
1506                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1507                                        DebugLoc dl,
1508                                        SelectionDAG &DAG,
1509                                        SmallVectorImpl<SDValue> &InVals) {
1510  MachineFunction &MF = DAG.getMachineFunction();
1511  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1512
1513  const Function* Fn = MF.getFunction();
1514  if (Fn->hasExternalLinkage() &&
1515      Subtarget->isTargetCygMing() &&
1516      Fn->getName() == "main")
1517    FuncInfo->setForceFramePointer(true);
1518
1519  MachineFrameInfo *MFI = MF.getFrameInfo();
1520  bool Is64Bit = Subtarget->is64Bit();
1521  bool IsWin64 = Subtarget->isTargetWin64();
1522
1523  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1524         "Var args not supported with calling convention fastcc or ghc");
1525
1526  // Assign locations to all of the incoming arguments.
1527  SmallVector<CCValAssign, 16> ArgLocs;
1528  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1529                 ArgLocs, *DAG.getContext());
1530  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1531
1532  unsigned LastVal = ~0U;
1533  SDValue ArgValue;
1534  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1535    CCValAssign &VA = ArgLocs[i];
1536    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1537    // places.
1538    assert(VA.getValNo() != LastVal &&
1539           "Don't support value assigned to multiple locs yet");
1540    LastVal = VA.getValNo();
1541
1542    if (VA.isRegLoc()) {
1543      EVT RegVT = VA.getLocVT();
1544      TargetRegisterClass *RC = NULL;
1545      if (RegVT == MVT::i32)
1546        RC = X86::GR32RegisterClass;
1547      else if (Is64Bit && RegVT == MVT::i64)
1548        RC = X86::GR64RegisterClass;
1549      else if (RegVT == MVT::f32)
1550        RC = X86::FR32RegisterClass;
1551      else if (RegVT == MVT::f64)
1552        RC = X86::FR64RegisterClass;
1553      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
1554        RC = X86::VR128RegisterClass;
1555      else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
1556        RC = X86::VR64RegisterClass;
1557      else
1558        llvm_unreachable("Unknown argument type!");
1559
1560      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1561      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1562
1563      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1564      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1565      // right size.
1566      if (VA.getLocInfo() == CCValAssign::SExt)
1567        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1568                               DAG.getValueType(VA.getValVT()));
1569      else if (VA.getLocInfo() == CCValAssign::ZExt)
1570        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1571                               DAG.getValueType(VA.getValVT()));
1572      else if (VA.getLocInfo() == CCValAssign::BCvt)
1573        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1574
1575      if (VA.isExtInLoc()) {
1576        // Handle MMX values passed in XMM regs.
1577        if (RegVT.isVector()) {
1578          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1579                                 ArgValue, DAG.getConstant(0, MVT::i64));
1580          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1581        } else
1582          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1583      }
1584    } else {
1585      assert(VA.isMemLoc());
1586      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1587    }
1588
1589    // If value is passed via pointer - do a load.
1590    if (VA.getLocInfo() == CCValAssign::Indirect)
1591      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0,
1592                             false, false, 0);
1593
1594    InVals.push_back(ArgValue);
1595  }
1596
1597  // The x86-64 ABI for returning structs by value requires that we copy
1598  // the sret argument into %rax for the return. Save the argument into
1599  // a virtual register so that we can access it from the return points.
1600  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1601    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1602    unsigned Reg = FuncInfo->getSRetReturnReg();
1603    if (!Reg) {
1604      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1605      FuncInfo->setSRetReturnReg(Reg);
1606    }
1607    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1608    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1609  }
1610
1611  unsigned StackSize = CCInfo.getNextStackOffset();
1612  // Align stack specially for tail calls.
1613  if (FuncIsMadeTailCallSafe(CallConv))
1614    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1615
1616  // If the function takes variable number of arguments, make a frame index for
1617  // the start of the first vararg value... for expansion of llvm.va_start.
1618  if (isVarArg) {
1619    if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
1620      VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize, true, false);
1621    }
1622    if (Is64Bit) {
1623      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1624
1625      // FIXME: We should really autogenerate these arrays
1626      static const unsigned GPR64ArgRegsWin64[] = {
1627        X86::RCX, X86::RDX, X86::R8,  X86::R9
1628      };
1629      static const unsigned XMMArgRegsWin64[] = {
1630        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1631      };
1632      static const unsigned GPR64ArgRegs64Bit[] = {
1633        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1634      };
1635      static const unsigned XMMArgRegs64Bit[] = {
1636        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1637        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1638      };
1639      const unsigned *GPR64ArgRegs, *XMMArgRegs;
1640
1641      if (IsWin64) {
1642        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1643        GPR64ArgRegs = GPR64ArgRegsWin64;
1644        XMMArgRegs = XMMArgRegsWin64;
1645      } else {
1646        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1647        GPR64ArgRegs = GPR64ArgRegs64Bit;
1648        XMMArgRegs = XMMArgRegs64Bit;
1649      }
1650      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1651                                                       TotalNumIntRegs);
1652      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1653                                                       TotalNumXMMRegs);
1654
1655      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1656      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1657             "SSE register cannot be used when SSE is disabled!");
1658      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1659             "SSE register cannot be used when SSE is disabled!");
1660      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1661        // Kernel mode asks for SSE to be disabled, so don't push them
1662        // on the stack.
1663        TotalNumXMMRegs = 0;
1664
1665      // For X86-64, if there are vararg parameters that are passed via
1666      // registers, then we must store them to their spots on the stack so they
1667      // may be loaded by dereferencing the result of va_next.
1668      VarArgsGPOffset = NumIntRegs * 8;
1669      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
1670      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
1671                                                 TotalNumXMMRegs * 16, 16,
1672                                                 false);
1673
1674      // Store the integer parameter registers.
1675      SmallVector<SDValue, 8> MemOps;
1676      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
1677      unsigned Offset = VarArgsGPOffset;
1678      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1679        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1680                                  DAG.getIntPtrConstant(Offset));
1681        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1682                                     X86::GR64RegisterClass);
1683        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1684        SDValue Store =
1685          DAG.getStore(Val.getValue(1), dl, Val, FIN,
1686                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
1687                       Offset, false, false, 0);
1688        MemOps.push_back(Store);
1689        Offset += 8;
1690      }
1691
1692      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
1693        // Now store the XMM (fp + vector) parameter registers.
1694        SmallVector<SDValue, 11> SaveXMMOps;
1695        SaveXMMOps.push_back(Chain);
1696
1697        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
1698        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
1699        SaveXMMOps.push_back(ALVal);
1700
1701        SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex));
1702        SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset));
1703
1704        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1705          unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
1706                                       X86::VR128RegisterClass);
1707          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
1708          SaveXMMOps.push_back(Val);
1709        }
1710        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
1711                                     MVT::Other,
1712                                     &SaveXMMOps[0], SaveXMMOps.size()));
1713      }
1714
1715      if (!MemOps.empty())
1716        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1717                            &MemOps[0], MemOps.size());
1718    }
1719  }
1720
1721  // Some CCs need callee pop.
1722  if (IsCalleePop(isVarArg, CallConv)) {
1723    BytesToPopOnReturn  = StackSize; // Callee pops everything.
1724  } else {
1725    BytesToPopOnReturn  = 0; // Callee pops nothing.
1726    // If this is an sret function, the return should pop the hidden pointer.
1727    if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
1728      BytesToPopOnReturn = 4;
1729  }
1730
1731  if (!Is64Bit) {
1732    RegSaveFrameIndex = 0xAAAAAAA;   // RegSaveFrameIndex is X86-64 only.
1733    if (CallConv == CallingConv::X86_FastCall)
1734      VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
1735  }
1736
1737  FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
1738
1739  return Chain;
1740}
1741
1742SDValue
1743X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
1744                                    SDValue StackPtr, SDValue Arg,
1745                                    DebugLoc dl, SelectionDAG &DAG,
1746                                    const CCValAssign &VA,
1747                                    ISD::ArgFlagsTy Flags) {
1748  const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
1749  unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
1750  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1751  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1752  if (Flags.isByVal()) {
1753    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1754  }
1755  return DAG.getStore(Chain, dl, Arg, PtrOff,
1756                      PseudoSourceValue::getStack(), LocMemOffset,
1757                      false, false, 0);
1758}
1759
1760/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
1761/// optimization is performed and it is required.
1762SDValue
1763X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1764                                           SDValue &OutRetAddr, SDValue Chain,
1765                                           bool IsTailCall, bool Is64Bit,
1766                                           int FPDiff, DebugLoc dl) {
1767  // Adjust the Return address stack slot.
1768  EVT VT = getPointerTy();
1769  OutRetAddr = getReturnAddressFrameIndex(DAG);
1770
1771  // Load the "old" Return address.
1772  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0);
1773  return SDValue(OutRetAddr.getNode(), 1);
1774}
1775
1776/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1777/// optimization is performed and it is required (FPDiff!=0).
1778static SDValue
1779EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1780                         SDValue Chain, SDValue RetAddrFrIdx,
1781                         bool Is64Bit, int FPDiff, DebugLoc dl) {
1782  // Store the return address to the appropriate stack slot.
1783  if (!FPDiff) return Chain;
1784  // Calculate the new stack slot for the return address.
1785  int SlotSize = Is64Bit ? 8 : 4;
1786  int NewReturnAddrFI =
1787    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false);
1788  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1789  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1790  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1791                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
1792                       false, false, 0);
1793  return Chain;
1794}
1795
1796SDValue
1797X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1798                             CallingConv::ID CallConv, bool isVarArg,
1799                             bool &isTailCall,
1800                             const SmallVectorImpl<ISD::OutputArg> &Outs,
1801                             const SmallVectorImpl<ISD::InputArg> &Ins,
1802                             DebugLoc dl, SelectionDAG &DAG,
1803                             SmallVectorImpl<SDValue> &InVals) {
1804  MachineFunction &MF = DAG.getMachineFunction();
1805  bool Is64Bit        = Subtarget->is64Bit();
1806  bool IsStructRet    = CallIsStructReturn(Outs);
1807  bool IsSibcall      = false;
1808
1809  if (isTailCall) {
1810    // Check if it's really possible to do a tail call.
1811    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1812                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1813                                                   Outs, Ins, DAG);
1814
1815    // Sibcalls are automatically detected tailcalls which do not require
1816    // ABI changes.
1817    if (!GuaranteedTailCallOpt && isTailCall)
1818      IsSibcall = true;
1819
1820    if (isTailCall)
1821      ++NumTailCalls;
1822  }
1823
1824  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1825         "Var args not supported with calling convention fastcc or ghc");
1826
1827  // Analyze operands of the call, assigning locations to each operand.
1828  SmallVector<CCValAssign, 16> ArgLocs;
1829  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1830                 ArgLocs, *DAG.getContext());
1831  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1832
1833  // Get a count of how many bytes are to be pushed on the stack.
1834  unsigned NumBytes = CCInfo.getNextStackOffset();
1835  if (IsSibcall)
1836    // This is a sibcall. The memory operands are already available in the
1837    // caller's own stack.
1838    NumBytes = 0;
1839  else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
1840    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1841
1842  int FPDiff = 0;
1843  if (isTailCall && !IsSibcall) {
1844    // Lower arguments at fp - stackoffset + fpdiff.
1845    unsigned NumBytesCallerPushed =
1846      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1847    FPDiff = NumBytesCallerPushed - NumBytes;
1848
1849    // Set the delta of movement of the return address stack slot.
1850    // But only set if delta is greater than previous delta.
1851    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1852      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1853  }
1854
1855  if (!IsSibcall)
1856    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1857
1858  SDValue RetAddrFrIdx;
1859  // Load the return address for tail calls.
1860  if (isTailCall && FPDiff)
1861    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
1862                                    Is64Bit, FPDiff, dl);
1863
1864  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1865  SmallVector<SDValue, 8> MemOpChains;
1866  SDValue StackPtr;
1867
1868  // Walk the register/memloc assignments, inserting copies/loads.  In the case
1869  // of tail call optimization, arguments are handled later.
1870  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1871    CCValAssign &VA = ArgLocs[i];
1872    EVT RegVT = VA.getLocVT();
1873    SDValue Arg = Outs[i].Val;
1874    ISD::ArgFlagsTy Flags = Outs[i].Flags;
1875    bool isByVal = Flags.isByVal();
1876
1877    // Promote the value if needed.
1878    switch (VA.getLocInfo()) {
1879    default: llvm_unreachable("Unknown loc info!");
1880    case CCValAssign::Full: break;
1881    case CCValAssign::SExt:
1882      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1883      break;
1884    case CCValAssign::ZExt:
1885      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1886      break;
1887    case CCValAssign::AExt:
1888      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1889        // Special case: passing MMX values in XMM registers.
1890        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1891        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1892        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1893      } else
1894        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1895      break;
1896    case CCValAssign::BCvt:
1897      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
1898      break;
1899    case CCValAssign::Indirect: {
1900      // Store the argument.
1901      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
1902      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1903      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
1904                           PseudoSourceValue::getFixedStack(FI), 0,
1905                           false, false, 0);
1906      Arg = SpillSlot;
1907      break;
1908    }
1909    }
1910
1911    if (VA.isRegLoc()) {
1912      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1913    } else if (!IsSibcall && (!isTailCall || isByVal)) {
1914      assert(VA.isMemLoc());
1915      if (StackPtr.getNode() == 0)
1916        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
1917      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1918                                             dl, DAG, VA, Flags));
1919    }
1920  }
1921
1922  if (!MemOpChains.empty())
1923    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1924                        &MemOpChains[0], MemOpChains.size());
1925
1926  // Build a sequence of copy-to-reg nodes chained together with token chain
1927  // and flag operands which copy the outgoing args into registers.
1928  SDValue InFlag;
1929  // Tail call byval lowering might overwrite argument registers so in case of
1930  // tail call optimization the copies to registers are lowered later.
1931  if (!isTailCall)
1932    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1933      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1934                               RegsToPass[i].second, InFlag);
1935      InFlag = Chain.getValue(1);
1936    }
1937
1938  if (Subtarget->isPICStyleGOT()) {
1939    // ELF / PIC requires the GOT pointer to be in the EBX register before
1940    // making function calls via the PLT.
1941    if (!isTailCall) {
1942      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
1943                               DAG.getNode(X86ISD::GlobalBaseReg,
1944                                           DebugLoc(), getPointerTy()),
1945                               InFlag);
1946      InFlag = Chain.getValue(1);
1947    } else {
1948      // If we are tail calling and generating PIC/GOT style code load the
1949      // address of the callee into ECX. The value in ecx is used as target of
1950      // the tail jump. This is done to circumvent the ebx/callee-saved problem
1951      // for tail calls on PIC/GOT architectures. Normally we would just put the
1952      // address of GOT into ebx and then call target@PLT. But for tail calls
1953      // ebx would be restored (since ebx is callee saved) before jumping to the
1954      // target@PLT.
1955
1956      // Note: The actual moving to ECX is done further down.
1957      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1958      if (G && !G->getGlobal()->hasHiddenVisibility() &&
1959          !G->getGlobal()->hasProtectedVisibility())
1960        Callee = LowerGlobalAddress(Callee, DAG);
1961      else if (isa<ExternalSymbolSDNode>(Callee))
1962        Callee = LowerExternalSymbol(Callee, DAG);
1963    }
1964  }
1965
1966  if (Is64Bit && isVarArg) {
1967    // From AMD64 ABI document:
1968    // For calls that may call functions that use varargs or stdargs
1969    // (prototype-less calls or calls to functions containing ellipsis (...) in
1970    // the declaration), %al is used as a hidden argument to specify the number
1971    // of SSE registers used. The contents of %al do not need to match exactly
1972    // the number of registers, but must be an upper bound on the number of SSE
1973    // registers used and must be in the range 0 - 8 inclusive.
1974
1975    // FIXME: Verify this on Win64
1976    // Count the number of XMM registers allocated.
1977    static const unsigned XMMArgRegs[] = {
1978      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1979      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1980    };
1981    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1982    assert((Subtarget->hasSSE1() || !NumXMMRegs)
1983           && "SSE registers cannot be used when SSE is disabled");
1984
1985    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
1986                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1987    InFlag = Chain.getValue(1);
1988  }
1989
1990
1991  // For tail calls lower the arguments to the 'real' stack slot.
1992  if (isTailCall) {
1993    // Force all the incoming stack arguments to be loaded from the stack
1994    // before any new outgoing arguments are stored to the stack, because the
1995    // outgoing stack slots may alias the incoming argument stack slots, and
1996    // the alias isn't otherwise explicit. This is slightly more conservative
1997    // than necessary, because it means that each store effectively depends
1998    // on every argument instead of just those arguments it would clobber.
1999    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2000
2001    SmallVector<SDValue, 8> MemOpChains2;
2002    SDValue FIN;
2003    int FI = 0;
2004    // Do not flag the preceding CopyToReg nodes together with the following stuff.
2005    InFlag = SDValue();
2006    if (GuaranteedTailCallOpt) {
2007      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2008        CCValAssign &VA = ArgLocs[i];
2009        if (VA.isRegLoc())
2010          continue;
2011        assert(VA.isMemLoc());
2012        SDValue Arg = Outs[i].Val;
2013        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2014        // Create frame index.
2015        int32_t Offset = VA.getLocMemOffset()+FPDiff;
2016        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2017        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
2018        FIN = DAG.getFrameIndex(FI, getPointerTy());
2019
2020        if (Flags.isByVal()) {
2021          // Copy relative to framepointer.
2022          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2023          if (StackPtr.getNode() == 0)
2024            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2025                                          getPointerTy());
2026          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2027
2028          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2029                                                           ArgChain,
2030                                                           Flags, DAG, dl));
2031        } else {
2032          // Store relative to framepointer.
2033          MemOpChains2.push_back(
2034            DAG.getStore(ArgChain, dl, Arg, FIN,
2035                         PseudoSourceValue::getFixedStack(FI), 0,
2036                         false, false, 0));
2037        }
2038      }
2039    }
2040
2041    if (!MemOpChains2.empty())
2042      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2043                          &MemOpChains2[0], MemOpChains2.size());
2044
2045    // Copy arguments to their registers.
2046    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2047      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2048                               RegsToPass[i].second, InFlag);
2049      InFlag = Chain.getValue(1);
2050    }
2051    InFlag =SDValue();
2052
2053    // Store the return address to the appropriate stack slot.
2054    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2055                                     FPDiff, dl);
2056  }
2057
2058  bool WasGlobalOrExternal = false;
2059  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2060    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2061    // In the 64-bit large code model, we have to make all calls
2062    // through a register, since the call instruction's 32-bit
2063    // pc-relative offset may not be large enough to hold the whole
2064    // address.
2065  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2066    WasGlobalOrExternal = true;
2067    // If the callee is a GlobalAddress node (quite common, every direct call
2068    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2069    // it.
2070
2071    // We should use extra load for direct calls to dllimported functions in
2072    // non-JIT mode.
2073    const GlobalValue *GV = G->getGlobal();
2074    if (!GV->hasDLLImportLinkage()) {
2075      unsigned char OpFlags = 0;
2076
2077      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2078      // external symbols must go through the PLT in PIC mode.  If the symbol
2079      // has hidden or protected visibility, or if it is static or local, then
2080      // we don't need to use the PLT - we can directly call it.
2081      if (Subtarget->isTargetELF() &&
2082          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2083          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2084        OpFlags = X86II::MO_PLT;
2085      } else if (Subtarget->isPICStyleStubAny() &&
2086               (GV->isDeclaration() || GV->isWeakForLinker()) &&
2087               Subtarget->getDarwinVers() < 9) {
2088        // PC-relative references to external symbols should go through $stub,
2089        // unless we're building with the leopard linker or later, which
2090        // automatically synthesizes these stubs.
2091        OpFlags = X86II::MO_DARWIN_STUB;
2092      }
2093
2094      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
2095                                          G->getOffset(), OpFlags);
2096    }
2097  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2098    WasGlobalOrExternal = true;
2099    unsigned char OpFlags = 0;
2100
2101    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
2102    // symbols should go through the PLT.
2103    if (Subtarget->isTargetELF() &&
2104        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2105      OpFlags = X86II::MO_PLT;
2106    } else if (Subtarget->isPICStyleStubAny() &&
2107             Subtarget->getDarwinVers() < 9) {
2108      // PC-relative references to external symbols should go through $stub,
2109      // unless we're building with the leopard linker or later, which
2110      // automatically synthesizes these stubs.
2111      OpFlags = X86II::MO_DARWIN_STUB;
2112    }
2113
2114    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2115                                         OpFlags);
2116  }
2117
2118  // Returns a chain & a flag for retval copy to use.
2119  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
2120  SmallVector<SDValue, 8> Ops;
2121
2122  if (!IsSibcall && isTailCall) {
2123    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2124                           DAG.getIntPtrConstant(0, true), InFlag);
2125    InFlag = Chain.getValue(1);
2126  }
2127
2128  Ops.push_back(Chain);
2129  Ops.push_back(Callee);
2130
2131  if (isTailCall)
2132    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2133
2134  // Add argument registers to the end of the list so that they are known live
2135  // into the call.
2136  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2137    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2138                                  RegsToPass[i].second.getValueType()));
2139
2140  // Add an implicit use GOT pointer in EBX.
2141  if (!isTailCall && Subtarget->isPICStyleGOT())
2142    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2143
2144  // Add an implicit use of AL for x86 vararg functions.
2145  if (Is64Bit && isVarArg)
2146    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2147
2148  if (InFlag.getNode())
2149    Ops.push_back(InFlag);
2150
2151  if (isTailCall) {
2152    // If this is the first return lowered for this function, add the regs
2153    // to the liveout set for the function.
2154    if (MF.getRegInfo().liveout_empty()) {
2155      SmallVector<CCValAssign, 16> RVLocs;
2156      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
2157                     *DAG.getContext());
2158      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2159      for (unsigned i = 0; i != RVLocs.size(); ++i)
2160        if (RVLocs[i].isRegLoc())
2161          MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
2162    }
2163    return DAG.getNode(X86ISD::TC_RETURN, dl,
2164                       NodeTys, &Ops[0], Ops.size());
2165  }
2166
2167  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2168  InFlag = Chain.getValue(1);
2169
2170  // Create the CALLSEQ_END node.
2171  unsigned NumBytesForCalleeToPush;
2172  if (IsCalleePop(isVarArg, CallConv))
2173    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2174  else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2175    // If this is a call to a struct-return function, the callee
2176    // pops the hidden struct pointer, so we have to push it back.
2177    // This is common for Darwin/X86, Linux & Mingw32 targets.
2178    NumBytesForCalleeToPush = 4;
2179  else
2180    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2181
2182  // Returns a flag for retval copy to use.
2183  if (!IsSibcall) {
2184    Chain = DAG.getCALLSEQ_END(Chain,
2185                               DAG.getIntPtrConstant(NumBytes, true),
2186                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2187                                                     true),
2188                               InFlag);
2189    InFlag = Chain.getValue(1);
2190  }
2191
2192  // Handle result values, copying them out of physregs into vregs that we
2193  // return.
2194  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2195                         Ins, dl, DAG, InVals);
2196}
2197
2198
2199//===----------------------------------------------------------------------===//
2200//                Fast Calling Convention (tail call) implementation
2201//===----------------------------------------------------------------------===//
2202
2203//  Like the StdCall convention, the callee cleans up the arguments, except that
2204//  ECX is reserved for storing the tail-called function's address. Only 2 registers are
2205//  free for argument passing (inreg). Tail call optimization is performed
2206//  provided:
2207//                * tailcallopt is enabled
2208//                * caller/callee are fastcc
2209//  On X86_64 architecture with GOT-style position independent code only local
2210//  (within module) calls are supported at the moment.
2211//  To keep the stack aligned according to the platform ABI, the function
2212//  GetAlignedArgumentStackSize ensures that the argument delta is always a
2213//  multiple of the stack alignment. (Dynamic linkers need this - e.g. darwin's dyld.)
2214//  If the tail-called callee has more arguments than the caller, the caller
2215//  needs to make sure that there is room to move the RETADDR to. This is
2216//  achieved by reserving an area the size of the argument delta right after the
2217//  original RETADDR, but before the saved frame pointer or the spilled registers,
2218//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
2219//  stack layout:
2220//    arg1
2221//    arg2
2222//    RETADDR
2223//    [ new RETADDR
2224//      move area ]
2225//    (possible EBP)
2226//    ESI
2227//    EDI
2228//    local1 ..
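//  Illustrative example (not from the original source): if the caller's own
//  arguments occupy 8 bytes and the callee's need 16, then
//  FPDiff = 8 - 16 = -8, and the "move area" above reserves those 8 extra bytes
//  so the return address can be moved into the reserved area before the tail
//  jump.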
2229
2230/// GetAlignedArgumentStackSize - Make the stack size aligned, e.g. to 16n + 12,
2231/// for a 16-byte stack alignment requirement.
2232unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2233                                                        SelectionDAG& DAG) {
2234  MachineFunction &MF = DAG.getMachineFunction();
2235  const TargetMachine &TM = MF.getTarget();
2236  const TargetFrameInfo &TFI = *TM.getFrameInfo();
2237  unsigned StackAlignment = TFI.getStackAlignment();
2238  uint64_t AlignMask = StackAlignment - 1;
2239  int64_t Offset = StackSize;
2240  uint64_t SlotSize = TD->getPointerSize();
2241  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2242    // Number smaller than 12 so just add the difference.
2243    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2244  } else {
2245    // Mask out lower bits, add stackalignment once plus the 12 bytes.
2246    Offset = ((~AlignMask) & Offset) + StackAlignment +
2247      (StackAlignment-SlotSize);
2248  }
2249  return Offset;
2250}
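// Illustrative worked example (not from the original source), assuming a 16-byte
// stack alignment and a 4-byte pointer size (SlotSize = 4):
//   StackSize = 20 -> 20 & 15 = 4 <= 12, so Offset = 20 + (12 - 4) = 28 = 16*1 + 12
//   StackSize = 30 -> 30 & 15 = 14 > 12, so Offset = (30 & ~15) + 16 + 12 = 44
// Both results leave exactly SlotSize bytes (the return address) to reach the
// next 16-byte boundary, which is what keeps the stack aligned across the call.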
2251
2252/// MatchingStackOffset - Return true if the given stack call argument is
2253/// already available in the same relative position in the caller's
2254/// incoming argument stack.
2255static
2256bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2257                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2258                         const X86InstrInfo *TII) {
2259  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2260  int FI = INT_MAX;
2261  if (Arg.getOpcode() == ISD::CopyFromReg) {
2262    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2263    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
2264      return false;
2265    MachineInstr *Def = MRI->getVRegDef(VR);
2266    if (!Def)
2267      return false;
2268    if (!Flags.isByVal()) {
2269      if (!TII->isLoadFromStackSlot(Def, FI))
2270        return false;
2271    } else {
2272      unsigned Opcode = Def->getOpcode();
2273      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2274          Def->getOperand(1).isFI()) {
2275        FI = Def->getOperand(1).getIndex();
2276        Bytes = Flags.getByValSize();
2277      } else
2278        return false;
2279    }
2280  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2281    if (Flags.isByVal())
2282      // ByVal argument is passed in as a pointer but it's now being
2283      // dereferenced. e.g.
2284      // define @foo(%struct.X* %A) {
2285      //   tail call @bar(%struct.X* byval %A)
2286      // }
2287      return false;
2288    SDValue Ptr = Ld->getBasePtr();
2289    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2290    if (!FINode)
2291      return false;
2292    FI = FINode->getIndex();
2293  } else
2294    return false;
2295
2296  assert(FI != INT_MAX);
2297  if (!MFI->isFixedObjectIndex(FI))
2298    return false;
2299  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2300}
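// Illustrative example (not from the original source): when a caller forwards an
// incoming stack argument unchanged, e.g.
//   define void @caller(i32 %x) {
//     tail call void @callee(i32 %x)
//     ret void
//   }
// the outgoing value is a load from (or a vreg defined by a load from) the
// caller's own fixed stack object, so the check above can confirm the argument
// already sits at the offset the callee expects and no store is needed for the
// sibcall.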
2301
2302/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2303/// for tail call optimization. Targets which want to do tail call
2304/// optimization should implement this function.
2305bool
2306X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2307                                                     CallingConv::ID CalleeCC,
2308                                                     bool isVarArg,
2309                                                     bool isCalleeStructRet,
2310                                                     bool isCallerStructRet,
2311                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
2312                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2313                                                     SelectionDAG& DAG) const {
2314  if (!IsTailCallConvention(CalleeCC) &&
2315      CalleeCC != CallingConv::C)
2316    return false;
2317
2318  // If -tailcallopt is specified, make fastcc functions tail-callable.
2319  const MachineFunction &MF = DAG.getMachineFunction();
2320  const Function *CallerF = DAG.getMachineFunction().getFunction();
2321  if (GuaranteedTailCallOpt) {
2322    if (IsTailCallConvention(CalleeCC) &&
2323        CallerF->getCallingConv() == CalleeCC)
2324      return true;
2325    return false;
2326  }
2327
2328  // Look for obvious safe cases where tail call optimization can be performed
2329  // without ABI changes. This is what gcc calls a sibcall.
2330
2331  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2332  // emit a special epilogue.
2333  if (RegInfo->needsStackRealignment(MF))
2334    return false;
2335
2336  // Do not sibcall-optimize vararg calls unless the call site passes no
2337  // arguments.
2338  if (isVarArg && !Outs.empty())
2339    return false;
2340
2341  // Also avoid sibcall optimization if either caller or callee uses struct
2342  // return semantics.
2343  if (isCalleeStructRet || isCallerStructRet)
2344    return false;
2345
2346  // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
2347  // Therefore, if the result is not used, it is not safe to optimize this into
2348  // a sibcall.
2349  bool Unused = false;
2350  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
2351    if (!Ins[i].Used) {
2352      Unused = true;
2353      break;
2354    }
2355  }
2356  if (Unused) {
2357    SmallVector<CCValAssign, 16> RVLocs;
2358    CCState CCInfo(CalleeCC, false, getTargetMachine(),
2359                   RVLocs, *DAG.getContext());
2360    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2361    for (unsigned i = 0; i != RVLocs.size(); ++i) {
2362      CCValAssign &VA = RVLocs[i];
2363      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
2364        return false;
2365    }
2366  }
2367
2368  // If the callee takes no arguments then go on to check the results of the
2369  // call.
2370  if (!Outs.empty()) {
2371    // Check if stack adjustment is needed. For now, do not do this if any
2372    // argument is passed on the stack.
2373    SmallVector<CCValAssign, 16> ArgLocs;
2374    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
2375                   ArgLocs, *DAG.getContext());
2376    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
2377    if (CCInfo.getNextStackOffset()) {
2378      MachineFunction &MF = DAG.getMachineFunction();
2379      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2380        return false;
2381      if (Subtarget->isTargetWin64())
2382        // Win64 ABI has additional complications.
2383        return false;
2384
2385      // Check if the arguments are already laid out in the right way as
2386      // the caller's fixed stack objects.
2387      MachineFrameInfo *MFI = MF.getFrameInfo();
2388      const MachineRegisterInfo *MRI = &MF.getRegInfo();
2389      const X86InstrInfo *TII =
2390        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
2391      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2392        CCValAssign &VA = ArgLocs[i];
2393        EVT RegVT = VA.getLocVT();
2394        SDValue Arg = Outs[i].Val;
2395        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2396        if (VA.getLocInfo() == CCValAssign::Indirect)
2397          return false;
2398        if (!VA.isRegLoc()) {
2399          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2400                                   MFI, MRI, TII))
2401            return false;
2402        }
2403      }
2404    }
2405  }
2406
2407  return true;
2408}
2409
2410FastISel *
2411X86TargetLowering::createFastISel(MachineFunction &mf,
2412                            DenseMap<const Value *, unsigned> &vm,
2413                            DenseMap<const BasicBlock*, MachineBasicBlock*> &bm,
2414                            DenseMap<const AllocaInst *, int> &am
2415#ifndef NDEBUG
2416                          , SmallSet<const Instruction *, 8> &cil
2417#endif
2418                                  ) {
2419  return X86::createFastISel(mf, vm, bm, am
2420#ifndef NDEBUG
2421                             , cil
2422#endif
2423                             );
2424}
2425
2426
2427//===----------------------------------------------------------------------===//
2428//                           Other Lowering Hooks
2429//===----------------------------------------------------------------------===//
2430
2431
2432SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
2433  MachineFunction &MF = DAG.getMachineFunction();
2434  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2435  int ReturnAddrIndex = FuncInfo->getRAIndex();
2436
2437  if (ReturnAddrIndex == 0) {
2438    // Set up a frame object for the return address.
2439    uint64_t SlotSize = TD->getPointerSize();
2440    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2441                                                           false, false);
2442    FuncInfo->setRAIndex(ReturnAddrIndex);
2443  }
2444
2445  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2446}
2447
2448
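/// isOffsetSuitableForCodeModel - Return true if the given offset satisfies
/// the displacement constraints of the given code model. For example, with a
/// symbolic displacement the small code model accepts an offset of
/// 15*1024*1024 but rejects 16*1024*1024, and the kernel code model accepts
/// positive offsets while rejecting negative ones.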
2449bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2450                                       bool hasSymbolicDisplacement) {
2451  // Offset should fit into a 32-bit immediate field.
2452  if (!isInt<32>(Offset))
2453    return false;
2454
2455  // If we don't have a symbolic displacement - we don't have any extra
2456  // restrictions.
2457  if (!hasSymbolicDisplacement)
2458    return true;
2459
2460  // FIXME: Some tweaks might be needed for medium code model.
2461  if (M != CodeModel::Small && M != CodeModel::Kernel)
2462    return false;
2463
2464  // For the small code model we assume that the last object is within 16MB of
2465  // the end of the 31-bit boundary. We may also accept pretty large negative
2466  // constants, knowing that all objects are in the positive half of the address space.
2467  if (M == CodeModel::Small && Offset < 16*1024*1024)
2468    return true;
2469
2470  // For the kernel code model we know that all objects reside in the negative
2471  // half of the 32-bit address space. We must not accept negative offsets, since
2472  // they may fall just outside that range, but we may accept pretty large positive ones.
2473  if (M == CodeModel::Kernel && Offset > 0)
2474    return true;
2475
2476  return false;
2477}
2478
2479/// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the X86
2480/// specific condition code, returning the condition code and the LHS/RHS of the
2481/// comparison to make.
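/// For example, an integer SETGT compare against the constant -1 is rewritten
/// below as a compare against 0 and mapped to X86::COND_NS, and a floating
/// point SETUEQ maps to X86::COND_E.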
2482static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2483                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2484  if (!isFP) {
2485    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2486      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2487        // X > -1   -> compare X against 0, jump if !sign.
2488        RHS = DAG.getConstant(0, RHS.getValueType());
2489        return X86::COND_NS;
2490      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2491        // X < 0   -> compare X against 0, jump if sign.
2492        return X86::COND_S;
2493      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2494        // X < 1   -> X <= 0
2495        RHS = DAG.getConstant(0, RHS.getValueType());
2496        return X86::COND_LE;
2497      }
2498    }
2499
2500    switch (SetCCOpcode) {
2501    default: llvm_unreachable("Invalid integer condition!");
2502    case ISD::SETEQ:  return X86::COND_E;
2503    case ISD::SETGT:  return X86::COND_G;
2504    case ISD::SETGE:  return X86::COND_GE;
2505    case ISD::SETLT:  return X86::COND_L;
2506    case ISD::SETLE:  return X86::COND_LE;
2507    case ISD::SETNE:  return X86::COND_NE;
2508    case ISD::SETULT: return X86::COND_B;
2509    case ISD::SETUGT: return X86::COND_A;
2510    case ISD::SETULE: return X86::COND_BE;
2511    case ISD::SETUGE: return X86::COND_AE;
2512    }
2513  }
2514
2515  // First determine if it is required or is profitable to flip the operands.
2516
2517  // If LHS is a foldable load, but RHS is not, flip the condition.
2518  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2519      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2520    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2521    std::swap(LHS, RHS);
2522  }
2523
2524  switch (SetCCOpcode) {
2525  default: break;
2526  case ISD::SETOLT:
2527  case ISD::SETOLE:
2528  case ISD::SETUGT:
2529  case ISD::SETUGE:
2530    std::swap(LHS, RHS);
2531    break;
2532  }
2533
2534  // On a floating point condition, the flags are set as follows:
2535  // ZF  PF  CF   op
2536  //  0 | 0 | 0 | X > Y
2537  //  0 | 0 | 1 | X < Y
2538  //  1 | 0 | 0 | X == Y
2539  //  1 | 1 | 1 | unordered
2540  switch (SetCCOpcode) {
2541  default: llvm_unreachable("Condcode should be pre-legalized away");
2542  case ISD::SETUEQ:
2543  case ISD::SETEQ:   return X86::COND_E;
2544  case ISD::SETOLT:              // flipped
2545  case ISD::SETOGT:
2546  case ISD::SETGT:   return X86::COND_A;
2547  case ISD::SETOLE:              // flipped
2548  case ISD::SETOGE:
2549  case ISD::SETGE:   return X86::COND_AE;
2550  case ISD::SETUGT:              // flipped
2551  case ISD::SETULT:
2552  case ISD::SETLT:   return X86::COND_B;
2553  case ISD::SETUGE:              // flipped
2554  case ISD::SETULE:
2555  case ISD::SETLE:   return X86::COND_BE;
2556  case ISD::SETONE:
2557  case ISD::SETNE:   return X86::COND_NE;
2558  case ISD::SETUO:   return X86::COND_P;
2559  case ISD::SETO:    return X86::COND_NP;
2560  case ISD::SETOEQ:
2561  case ISD::SETUNE:  return X86::COND_INVALID;
2562  }
2563}
2564
2565/// hasFPCMov - is there a floating point cmov for the specific X86 condition
2566/// code? The current x86 ISA includes the following FP cmov instructions:
2567/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2568static bool hasFPCMov(unsigned X86CC) {
2569  switch (X86CC) {
2570  default:
2571    return false;
2572  case X86::COND_B:
2573  case X86::COND_BE:
2574  case X86::COND_E:
2575  case X86::COND_P:
2576  case X86::COND_A:
2577  case X86::COND_AE:
2578  case X86::COND_NE:
2579  case X86::COND_NP:
2580    return true;
2581  }
2582}
2583
2584/// isFPImmLegal - Returns true if the target can instruction select the
2585/// specified FP immediate natively. If false, the legalizer will
2586/// materialize the FP immediate as a load from a constant pool.
2587bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2588  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2589    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2590      return true;
2591  }
2592  return false;
2593}
2594
2595/// isUndefOrInRange - Return true if Val is undef or if its value falls within
2596/// the specified range [Low, Hi).
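/// For example, isUndefOrInRange(-1, 0, 4) and isUndefOrInRange(3, 0, 4) are
/// true, while isUndefOrInRange(4, 0, 4) is false.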
2597static bool isUndefOrInRange(int Val, int Low, int Hi) {
2598  return (Val < 0) || (Val >= Low && Val < Hi);
2599}
2600
2601/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2602/// specified value.
2603static bool isUndefOrEqual(int Val, int CmpVal) {
2604  if (Val < 0 || Val == CmpVal)
2605    return true;
2606  return false;
2607}
2608
2609/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2610/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
2611/// the second operand.
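/// For example, on v4i32 the mask <2,0,1,3> qualifies (every element indexes
/// the first operand), while <4,0,1,3> does not.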
2612static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2613  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
2614    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2615  if (VT == MVT::v2f64 || VT == MVT::v2i64)
2616    return (Mask[0] < 2 && Mask[1] < 2);
2617  return false;
2618}
2619
2620bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
2621  SmallVector<int, 8> M;
2622  N->getMask(M);
2623  return ::isPSHUFDMask(M, N->getValueType(0));
2624}
2625
2626/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
2627/// is suitable for input to PSHUFHW.
2628static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2629  if (VT != MVT::v8i16)
2630    return false;
2631
2632  // Lower quadword copied in order or undef.
2633  for (int i = 0; i != 4; ++i)
2634    if (Mask[i] >= 0 && Mask[i] != i)
2635      return false;
2636
2637  // Upper quadword shuffled.
2638  for (int i = 4; i != 8; ++i)
2639    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2640      return false;
2641
2642  return true;
2643}
2644
2645bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
2646  SmallVector<int, 8> M;
2647  N->getMask(M);
2648  return ::isPSHUFHWMask(M, N->getValueType(0));
2649}
2650
2651/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
2652/// is suitable for input to PSHUFLW.
2653static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2654  if (VT != MVT::v8i16)
2655    return false;
2656
2657  // Upper quadword copied in order.
2658  for (int i = 4; i != 8; ++i)
2659    if (Mask[i] >= 0 && Mask[i] != i)
2660      return false;
2661
2662  // Lower quadword shuffled.
2663  for (int i = 0; i != 4; ++i)
2664    if (Mask[i] >= 4)
2665      return false;
2666
2667  return true;
2668}
2669
2670bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
2671  SmallVector<int, 8> M;
2672  N->getMask(M);
2673  return ::isPSHUFLWMask(M, N->getValueType(0));
2674}
2675
2676/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
2677/// is suitable for input to PALIGNR.
2678static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
2679                          bool hasSSSE3) {
2680  int i, e = VT.getVectorNumElements();
2681
2682  // Do not handle v2i64 / v2f64 shuffles with palignr.
2683  if (e < 4 || !hasSSSE3)
2684    return false;
2685
2686  for (i = 0; i != e; ++i)
2687    if (Mask[i] >= 0)
2688      break;
2689
2690  // All undef, not a palignr.
2691  if (i == e)
2692    return false;
2693
2694  // Determine if it's ok to perform a palignr with only the LHS, since we
2695  // don't have access to the actual shuffle elements to see if RHS is undef.
2696  bool Unary = Mask[i] < (int)e;
2697  bool NeedsUnary = false;
2698
2699  int s = Mask[i] - i;
2700
2701  // Check the rest of the elements to see if they are consecutive.
2702  for (++i; i != e; ++i) {
2703    int m = Mask[i];
2704    if (m < 0)
2705      continue;
2706
2707    Unary = Unary && (m < (int)e);
2708    NeedsUnary = NeedsUnary || (m < s);
2709
2710    if (NeedsUnary && !Unary)
2711      return false;
2712    if (Unary && m != ((s+i) & (e-1)))
2713      return false;
2714    if (!Unary && m != (s+i))
2715      return false;
2716  }
2717  return true;
2718}
2719
2720bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
2721  SmallVector<int, 8> M;
2722  N->getMask(M);
2723  return ::isPALIGNRMask(M, N->getValueType(0), true);
2724}
2725
2726/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2727/// specifies a shuffle of elements that is suitable for input to SHUFP*.
2728static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2729  int NumElems = VT.getVectorNumElements();
2730  if (NumElems != 2 && NumElems != 4)
2731    return false;
2732
2733  int Half = NumElems / 2;
2734  for (int i = 0; i < Half; ++i)
2735    if (!isUndefOrInRange(Mask[i], 0, NumElems))
2736      return false;
2737  for (int i = Half; i < NumElems; ++i)
2738    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2739      return false;
2740
2741  return true;
2742}
2743
2744bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
2745  SmallVector<int, 8> M;
2746  N->getMask(M);
2747  return ::isSHUFPMask(M, N->getValueType(0));
2748}
2749
2750/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
2751/// the reverse of what x86 shuffles want. x86 shuffles require the lower
2752/// half elements to come from vector 1 (which would equal the destination) and
2753/// the upper half to come from vector 2.
2754static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2755  int NumElems = VT.getVectorNumElements();
2756
2757  if (NumElems != 2 && NumElems != 4)
2758    return false;
2759
2760  int Half = NumElems / 2;
2761  for (int i = 0; i < Half; ++i)
2762    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2763      return false;
2764  for (int i = Half; i < NumElems; ++i)
2765    if (!isUndefOrInRange(Mask[i], 0, NumElems))
2766      return false;
2767  return true;
2768}
2769
2770static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
2771  SmallVector<int, 8> M;
2772  N->getMask(M);
2773  return isCommutedSHUFPMask(M, N->getValueType(0));
2774}
2775
2776/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
2777/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
2778bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
2779  if (N->getValueType(0).getVectorNumElements() != 4)
2780    return false;
2781
2782  // Expect elt 0 == 6, elt 1 == 7, elt 2 == 2, elt 3 == 3
2783  return isUndefOrEqual(N->getMaskElt(0), 6) &&
2784         isUndefOrEqual(N->getMaskElt(1), 7) &&
2785         isUndefOrEqual(N->getMaskElt(2), 2) &&
2786         isUndefOrEqual(N->getMaskElt(3), 3);
2787}
2788
2789/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
2790/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
2791/// <2, 3, 2, 3>
2792bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
2793  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2794
2795  if (NumElems != 4)
2796    return false;
2797
2798  return isUndefOrEqual(N->getMaskElt(0), 2) &&
2799  isUndefOrEqual(N->getMaskElt(1), 3) &&
2800  isUndefOrEqual(N->getMaskElt(2), 2) &&
2801  isUndefOrEqual(N->getMaskElt(3), 3);
2802}
2803
2804/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
2805/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
2806bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
2807  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2808
2809  if (NumElems != 2 && NumElems != 4)
2810    return false;
2811
2812  for (unsigned i = 0; i < NumElems/2; ++i)
2813    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
2814      return false;
2815
2816  for (unsigned i = NumElems/2; i < NumElems; ++i)
2817    if (!isUndefOrEqual(N->getMaskElt(i), i))
2818      return false;
2819
2820  return true;
2821}
2822
2823/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
2824/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
2825bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
2826  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2827
2828  if (NumElems != 2 && NumElems != 4)
2829    return false;
2830
2831  for (unsigned i = 0; i < NumElems/2; ++i)
2832    if (!isUndefOrEqual(N->getMaskElt(i), i))
2833      return false;
2834
2835  for (unsigned i = 0; i < NumElems/2; ++i)
2836    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
2837      return false;
2838
2839  return true;
2840}
2841
2842/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
2843/// specifies a shuffle of elements that is suitable for input to UNPCKL.
2844static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
2845                         bool V2IsSplat = false) {
2846  int NumElts = VT.getVectorNumElements();
2847  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2848    return false;
2849
2850  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2851    int BitI  = Mask[i];
2852    int BitI1 = Mask[i+1];
2853    if (!isUndefOrEqual(BitI, j))
2854      return false;
2855    if (V2IsSplat) {
2856      if (!isUndefOrEqual(BitI1, NumElts))
2857        return false;
2858    } else {
2859      if (!isUndefOrEqual(BitI1, j + NumElts))
2860        return false;
2861    }
2862  }
2863  return true;
2864}
2865
2866bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2867  SmallVector<int, 8> M;
2868  N->getMask(M);
2869  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
2870}
2871
2872/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
2873/// specifies a shuffle of elements that is suitable for input to UNPCKH.
2874static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
2875                         bool V2IsSplat = false) {
2876  int NumElts = VT.getVectorNumElements();
2877  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2878    return false;
2879
2880  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2881    int BitI  = Mask[i];
2882    int BitI1 = Mask[i+1];
2883    if (!isUndefOrEqual(BitI, j + NumElts/2))
2884      return false;
2885    if (V2IsSplat) {
2886      if (isUndefOrEqual(BitI1, NumElts))
2887        return false;
2888    } else {
2889      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
2890        return false;
2891    }
2892  }
2893  return true;
2894}
2895
2896bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2897  SmallVector<int, 8> M;
2898  N->getMask(M);
2899  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
2900}
2901
2902/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
2903/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
2904/// <0, 0, 1, 1>
2905static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2906  int NumElems = VT.getVectorNumElements();
2907  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2908    return false;
2909
2910  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
2911    int BitI  = Mask[i];
2912    int BitI1 = Mask[i+1];
2913    if (!isUndefOrEqual(BitI, j))
2914      return false;
2915    if (!isUndefOrEqual(BitI1, j))
2916      return false;
2917  }
2918  return true;
2919}
2920
2921bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
2922  SmallVector<int, 8> M;
2923  N->getMask(M);
2924  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
2925}
2926
2927/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
2928/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
2929/// <2, 2, 3, 3>
2930static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2931  int NumElems = VT.getVectorNumElements();
2932  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2933    return false;
2934
2935  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
2936    int BitI  = Mask[i];
2937    int BitI1 = Mask[i+1];
2938    if (!isUndefOrEqual(BitI, j))
2939      return false;
2940    if (!isUndefOrEqual(BitI1, j))
2941      return false;
2942  }
2943  return true;
2944}
2945
2946bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
2947  SmallVector<int, 8> M;
2948  N->getMask(M);
2949  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
2950}
2951
2952/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
2953/// specifies a shuffle of elements that is suitable for input to MOVSS,
2954/// MOVSD, and MOVD, i.e. setting the lowest element.
2955static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2956  if (VT.getVectorElementType().getSizeInBits() < 32)
2957    return false;
2958
2959  int NumElts = VT.getVectorNumElements();
2960
2961  if (!isUndefOrEqual(Mask[0], NumElts))
2962    return false;
2963
2964  for (int i = 1; i < NumElts; ++i)
2965    if (!isUndefOrEqual(Mask[i], i))
2966      return false;
2967
2968  return true;
2969}
2970
2971bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
2972  SmallVector<int, 8> M;
2973  N->getMask(M);
2974  return ::isMOVLMask(M, N->getValueType(0));
2975}
2976
2977/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
2978/// of what x86 movss wants. X86 movss requires the lowest element to be the
2979/// lowest element of vector 2 and the other elements to come from vector 1 in order.
2980static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
2981                               bool V2IsSplat = false, bool V2IsUndef = false) {
2982  int NumOps = VT.getVectorNumElements();
2983  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
2984    return false;
2985
2986  if (!isUndefOrEqual(Mask[0], 0))
2987    return false;
2988
2989  for (int i = 1; i < NumOps; ++i)
2990    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
2991          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
2992          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
2993      return false;
2994
2995  return true;
2996}
2997
2998static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
2999                           bool V2IsUndef = false) {
3000  SmallVector<int, 8> M;
3001  N->getMask(M);
3002  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
3003}
3004
3005/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3006/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3007bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
3008  if (N->getValueType(0).getVectorNumElements() != 4)
3009    return false;
3010
3011  // Expect 1, 1, 3, 3
3012  for (unsigned i = 0; i < 2; ++i) {
3013    int Elt = N->getMaskElt(i);
3014    if (Elt >= 0 && Elt != 1)
3015      return false;
3016  }
3017
3018  bool HasHi = false;
3019  for (unsigned i = 2; i < 4; ++i) {
3020    int Elt = N->getMaskElt(i);
3021    if (Elt >= 0 && Elt != 3)
3022      return false;
3023    if (Elt == 3)
3024      HasHi = true;
3025  }
3026  // Don't use movshdup if it can be done with a shufps.
3027  // FIXME: verify that matching u, u, 3, 3 is what we want.
3028  return HasHi;
3029}
3030
3031/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3032/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3033bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
3034  if (N->getValueType(0).getVectorNumElements() != 4)
3035    return false;
3036
3037  // Expect 0, 0, 2, 2
3038  for (unsigned i = 0; i < 2; ++i)
3039    if (N->getMaskElt(i) > 0)
3040      return false;
3041
3042  bool HasHi = false;
3043  for (unsigned i = 2; i < 4; ++i) {
3044    int Elt = N->getMaskElt(i);
3045    if (Elt >= 0 && Elt != 2)
3046      return false;
3047    if (Elt == 2)
3048      HasHi = true;
3049  }
3050  // Don't use movsldup if it can be done with a shufps.
3051  return HasHi;
3052}
3053
3054/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3055/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
3056bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
3057  int e = N->getValueType(0).getVectorNumElements() / 2;
3058
3059  for (int i = 0; i < e; ++i)
3060    if (!isUndefOrEqual(N->getMaskElt(i), i))
3061      return false;
3062  for (int i = 0; i < e; ++i)
3063    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
3064      return false;
3065  return true;
3066}
3067
3068/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
3069/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
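/// The encoding uses two bits per element with element 0 in the low bits and
/// undef elements treated as 0; for example, a v4 mask <3,1,2,0> yields the
/// immediate 0b00100111 (0x27).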
3070unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
3071  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3072  int NumOperands = SVOp->getValueType(0).getVectorNumElements();
3073
3074  unsigned Shift = (NumOperands == 4) ? 2 : 1;
3075  unsigned Mask = 0;
3076  for (int i = 0; i < NumOperands; ++i) {
3077    int Val = SVOp->getMaskElt(NumOperands-i-1);
3078    if (Val < 0) Val = 0;
3079    if (Val >= NumOperands) Val -= NumOperands;
3080    Mask |= Val;
3081    if (i != NumOperands - 1)
3082      Mask <<= Shift;
3083  }
3084  return Mask;
3085}
3086
3087/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
3088/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
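/// Only the upper four mask elements participate; for example, a v8i16 mask
/// <0,1,2,3,7,6,5,4> yields the immediate 0b00011011 (0x1B).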
3089unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
3090  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3091  unsigned Mask = 0;
3092  // 8 nodes, but we only care about the last 4.
3093  for (unsigned i = 7; i >= 4; --i) {
3094    int Val = SVOp->getMaskElt(i);
3095    if (Val >= 0)
3096      Mask |= (Val - 4);
3097    if (i != 4)
3098      Mask <<= 2;
3099  }
3100  return Mask;
3101}
3102
3103/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
3104/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
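/// Only the lower four mask elements participate; for example, a v8i16 mask
/// <3,2,1,0,4,5,6,7> yields the immediate 0b00011011 (0x1B).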
3105unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
3106  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3107  unsigned Mask = 0;
3108  // 8 nodes, but we only care about the first 4.
3109  for (int i = 3; i >= 0; --i) {
3110    int Val = SVOp->getMaskElt(i);
3111    if (Val >= 0)
3112      Mask |= Val;
3113    if (i != 0)
3114      Mask <<= 2;
3115  }
3116  return Mask;
3117}
3118
3119/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
3120/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
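/// The immediate is (first defined mask value - its index) * element size in
/// bytes; for example, a v8i16 mask beginning <3,4,5,...> yields (3-0)*2 = 6.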
3121unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
3122  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3123  EVT VVT = N->getValueType(0);
3124  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
3125  int Val = 0;
3126
3127  unsigned i, e;
3128  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
3129    Val = SVOp->getMaskElt(i);
3130    if (Val >= 0)
3131      break;
3132  }
3133  return (Val - i) * EltSize;
3134}
3135
3136/// isZeroNode - Returns true if Elt is a constant zero or a floating point
3137/// constant +0.0.
3138bool X86::isZeroNode(SDValue Elt) {
3139  return ((isa<ConstantSDNode>(Elt) &&
3140           cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
3141          (isa<ConstantFPSDNode>(Elt) &&
3142           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
3143}
3144
3145/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
3146/// their permute mask.
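/// For example, a v4 shuffle of (A, B) with mask <0,5,2,7> becomes a shuffle
/// of (B, A) with mask <4,1,6,3>; undef elements are left unchanged.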
3147static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
3148                                    SelectionDAG &DAG) {
3149  EVT VT = SVOp->getValueType(0);
3150  unsigned NumElems = VT.getVectorNumElements();
3151  SmallVector<int, 8> MaskVec;
3152
3153  for (unsigned i = 0; i != NumElems; ++i) {
3154    int idx = SVOp->getMaskElt(i);
3155    if (idx < 0)
3156      MaskVec.push_back(idx);
3157    else if (idx < (int)NumElems)
3158      MaskVec.push_back(idx + NumElems);
3159    else
3160      MaskVec.push_back(idx - NumElems);
3161  }
3162  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
3163                              SVOp->getOperand(0), &MaskVec[0]);
3164}
3165
3166/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3167/// the two vector operands have swapped position.
3168static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
3169  unsigned NumElems = VT.getVectorNumElements();
3170  for (unsigned i = 0; i != NumElems; ++i) {
3171    int idx = Mask[i];
3172    if (idx < 0)
3173      continue;
3174    else if (idx < (int)NumElems)
3175      Mask[i] = idx + NumElems;
3176    else
3177      Mask[i] = idx - NumElems;
3178  }
3179}
3180
3181/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
3182/// match movhlps. The lower half elements should come from the upper half of
3183/// V1 (and in order), and the upper half elements should come from the upper
3184/// half of V2 (and in order).
3185static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
3186  if (Op->getValueType(0).getVectorNumElements() != 4)
3187    return false;
3188  for (unsigned i = 0, e = 2; i != e; ++i)
3189    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
3190      return false;
3191  for (unsigned i = 2; i != 4; ++i)
3192    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
3193      return false;
3194  return true;
3195}
3196
3197/// isScalarLoadToVector - Returns true if the node is a scalar load that
3198/// is promoted to a vector. It also returns the LoadSDNode by reference if
3199/// required.
3200static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
3201  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
3202    return false;
3203  N = N->getOperand(0).getNode();
3204  if (!ISD::isNON_EXTLoad(N))
3205    return false;
3206  if (LD)
3207    *LD = cast<LoadSDNode>(N);
3208  return true;
3209}
3210
3211/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
3212/// match movlp{s|d}. The lower half elements should come from the lower half of
3213/// V1 (and in order), and the upper half elements should come from the upper
3214/// half of V2 (and in order). And since V1 will become the source of the
3215/// MOVLP, it must be either a vector load or a scalar load to vector.
3216static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
3217                               ShuffleVectorSDNode *Op) {
3218  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
3219    return false;
3220  // If V2 is a vector load, don't do this transformation. We will try to use
3221  // a load-folding shufps op instead.
3222  if (ISD::isNON_EXTLoad(V2))
3223    return false;
3224
3225  unsigned NumElems = Op->getValueType(0).getVectorNumElements();
3226
3227  if (NumElems != 2 && NumElems != 4)
3228    return false;
3229  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3230    if (!isUndefOrEqual(Op->getMaskElt(i), i))
3231      return false;
3232  for (unsigned i = NumElems/2; i != NumElems; ++i)
3233    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
3234      return false;
3235  return true;
3236}
3237
3238/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
3239/// all the same.
3240static bool isSplatVector(SDNode *N) {
3241  if (N->getOpcode() != ISD::BUILD_VECTOR)
3242    return false;
3243
3244  SDValue SplatValue = N->getOperand(0);
3245  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
3246    if (N->getOperand(i) != SplatValue)
3247      return false;
3248  return true;
3249}
3250
3251/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
3252/// to a zero vector.
3253/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
3254static bool isZeroShuffle(ShuffleVectorSDNode *N) {
3255  SDValue V1 = N->getOperand(0);
3256  SDValue V2 = N->getOperand(1);
3257  unsigned NumElems = N->getValueType(0).getVectorNumElements();
3258  for (unsigned i = 0; i != NumElems; ++i) {
3259    int Idx = N->getMaskElt(i);
3260    if (Idx >= (int)NumElems) {
3261      unsigned Opc = V2.getOpcode();
3262      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
3263        continue;
3264      if (Opc != ISD::BUILD_VECTOR ||
3265          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
3266        return false;
3267    } else if (Idx >= 0) {
3268      unsigned Opc = V1.getOpcode();
3269      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
3270        continue;
3271      if (Opc != ISD::BUILD_VECTOR ||
3272          !X86::isZeroNode(V1.getOperand(Idx)))
3273        return false;
3274    }
3275  }
3276  return true;
3277}
3278
3279/// getZeroVector - Returns a vector of specified type with all zero elements.
3280///
3281static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
3282                             DebugLoc dl) {
3283  assert(VT.isVector() && "Expected a vector type");
3284
3285  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3286  // type.  This ensures they get CSE'd.
3287  SDValue Vec;
3288  if (VT.getSizeInBits() == 64) { // MMX
3289    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3290    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3291  } else if (HasSSE2) {  // SSE2
3292    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3293    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3294  } else { // SSE1
3295    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
3296    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
3297  }
3298  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3299}
3300
3301/// getOnesVector - Returns a vector of specified type with all bits set.
3302///
3303static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
3304  assert(VT.isVector() && "Expected a vector type");
3305
3306  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3307  // type.  This ensures they get CSE'd.
3308  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
3309  SDValue Vec;
3310  if (VT.getSizeInBits() == 64)  // MMX
3311    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3312  else                                              // SSE
3313    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3314  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3315}
3316
3317
3318/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
3319/// that point to V2 point to its first element.
3320static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
3321  EVT VT = SVOp->getValueType(0);
3322  unsigned NumElems = VT.getVectorNumElements();
3323
3324  bool Changed = false;
3325  SmallVector<int, 8> MaskVec;
3326  SVOp->getMask(MaskVec);
3327
3328  for (unsigned i = 0; i != NumElems; ++i) {
3329    if (MaskVec[i] > (int)NumElems) {
3330      MaskVec[i] = NumElems;
3331      Changed = true;
3332    }
3333  }
3334  if (Changed)
3335    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
3336                                SVOp->getOperand(1), &MaskVec[0]);
3337  return SDValue(SVOp, 0);
3338}
3339
3340/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
3341/// operation of the specified width.
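/// For example, for a v4 type this builds the mask <4,1,2,3>, taking the low
/// element from V2 and the remaining elements from V1.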
3342static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3343                       SDValue V2) {
3344  unsigned NumElems = VT.getVectorNumElements();
3345  SmallVector<int, 8> Mask;
3346  Mask.push_back(NumElems);
3347  for (unsigned i = 1; i != NumElems; ++i)
3348    Mask.push_back(i);
3349  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3350}
3351
3352/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
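/// For example, for a v4 type this builds the mask <0,4,1,5>.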
3353static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3354                          SDValue V2) {
3355  unsigned NumElems = VT.getVectorNumElements();
3356  SmallVector<int, 8> Mask;
3357  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
3358    Mask.push_back(i);
3359    Mask.push_back(i + NumElems);
3360  }
3361  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3362}
3363
3364/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
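/// For example, for a v4 type this builds the mask <2,6,3,7>.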
3365static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3366                          SDValue V2) {
3367  unsigned NumElems = VT.getVectorNumElements();
3368  unsigned Half = NumElems/2;
3369  SmallVector<int, 8> Mask;
3370  for (unsigned i = 0; i != Half; ++i) {
3371    Mask.push_back(i + Half);
3372    Mask.push_back(i + NumElems + Half);
3373  }
3374  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3375}
3376
3377/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4f32.
3378static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
3379                            bool HasSSE2) {
3380  if (SV->getValueType(0).getVectorNumElements() <= 4)
3381    return SDValue(SV, 0);
3382
3383  EVT PVT = MVT::v4f32;
3384  EVT VT = SV->getValueType(0);
3385  DebugLoc dl = SV->getDebugLoc();
3386  SDValue V1 = SV->getOperand(0);
3387  int NumElems = VT.getVectorNumElements();
3388  int EltNo = SV->getSplatIndex();
3389
3390  // unpack elements to the correct location
3391  while (NumElems > 4) {
3392    if (EltNo < NumElems/2) {
3393      V1 = getUnpackl(DAG, dl, VT, V1, V1);
3394    } else {
3395      V1 = getUnpackh(DAG, dl, VT, V1, V1);
3396      EltNo -= NumElems/2;
3397    }
3398    NumElems >>= 1;
3399  }
3400
3401  // Perform the splat.
3402  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
3403  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
3404  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
3405  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
3406}
3407
3408/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3409/// vector and a zero or undef vector.  This produces a shuffle where the low
3410/// element of V2 is swizzled into the zero/undef vector, landing at element
3411/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
3412static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3413                                             bool isZero, bool HasSSE2,
3414                                             SelectionDAG &DAG) {
3415  EVT VT = V2.getValueType();
3416  SDValue V1 = isZero
3417    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
3418  unsigned NumElems = VT.getVectorNumElements();
3419  SmallVector<int, 16> MaskVec;
3420  for (unsigned i = 0; i != NumElems; ++i)
3421    // If this is the insertion idx, put the low elt of V2 here.
3422    MaskVec.push_back(i == Idx ? NumElems : i);
3423  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
3424}
3425
3426/// getNumOfConsecutiveZeros - Return the number of consecutive zero (or undef)
3427/// elements at the low or high end of a shuffle result.
3428static
3429unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
3430                                  bool Low, SelectionDAG &DAG) {
3431  unsigned NumZeros = 0;
3432  for (int i = 0; i < NumElems; ++i) {
3433    unsigned Index = Low ? i : NumElems-i-1;
3434    int Idx = SVOp->getMaskElt(Index);
3435    if (Idx < 0) {
3436      ++NumZeros;
3437      continue;
3438    }
3439    SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
3440    if (Elt.getNode() && X86::isZeroNode(Elt))
3441      ++NumZeros;
3442    else
3443      break;
3444  }
3445  return NumZeros;
3446}
3447
3448/// isVectorShift - Returns true if the shuffle can be implemented as a
3449/// logical left or right shift of a vector.
3450/// FIXME: split into pslldqi, psrldqi, palignr variants.
3451static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3452                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3453  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
3454
3455  isLeft = true;
3456  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
3457  if (!NumZeros) {
3458    isLeft = false;
3459    NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
3460    if (!NumZeros)
3461      return false;
3462  }
3463  bool SeenV1 = false;
3464  bool SeenV2 = false;
3465  for (unsigned i = NumZeros; i < NumElems; ++i) {
3466    unsigned Val = isLeft ? (i - NumZeros) : i;
3467    int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
3468    if (Idx_ < 0)
3469      continue;
3470    unsigned Idx = (unsigned) Idx_;
3471    if (Idx < NumElems)
3472      SeenV1 = true;
3473    else {
3474      Idx -= NumElems;
3475      SeenV2 = true;
3476    }
3477    if (Idx != Val)
3478      return false;
3479  }
3480  if (SeenV1 && SeenV2)
3481    return false;
3482
3483  ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
3484  ShAmt = NumZeros;
3485  return true;
3486}
3487
3488
3489/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
3490///
3491static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
3492                                       unsigned NumNonZero, unsigned NumZero,
3493                                       SelectionDAG &DAG, TargetLowering &TLI) {
3494  if (NumNonZero > 8)
3495    return SDValue();
3496
3497  DebugLoc dl = Op.getDebugLoc();
3498  SDValue V(0, 0);
3499  bool First = true;
3500  for (unsigned i = 0; i < 16; ++i) {
3501    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
3502    if (ThisIsNonZero && First) {
3503      if (NumZero)
3504        V = getZeroVector(MVT::v8i16, true, DAG, dl);
3505      else
3506        V = DAG.getUNDEF(MVT::v8i16);
3507      First = false;
3508    }
3509
3510    if ((i & 1) != 0) {
3511      SDValue ThisElt(0, 0), LastElt(0, 0);
3512      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
3513      if (LastIsNonZero) {
3514        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
3515                              MVT::i16, Op.getOperand(i-1));
3516      }
3517      if (ThisIsNonZero) {
3518        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
3519        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
3520                              ThisElt, DAG.getConstant(8, MVT::i8));
3521        if (LastIsNonZero)
3522          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
3523      } else
3524        ThisElt = LastElt;
3525
3526      if (ThisElt.getNode())
3527        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
3528                        DAG.getIntPtrConstant(i/2));
3529    }
3530  }
3531
3532  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
3533}
3534
3535/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
3536///
3537static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
3538                                       unsigned NumNonZero, unsigned NumZero,
3539                                       SelectionDAG &DAG, TargetLowering &TLI) {
3540  if (NumNonZero > 4)
3541    return SDValue();
3542
3543  DebugLoc dl = Op.getDebugLoc();
3544  SDValue V(0, 0);
3545  bool First = true;
3546  for (unsigned i = 0; i < 8; ++i) {
3547    bool isNonZero = (NonZeros & (1 << i)) != 0;
3548    if (isNonZero) {
3549      if (First) {
3550        if (NumZero)
3551          V = getZeroVector(MVT::v8i16, true, DAG, dl);
3552        else
3553          V = DAG.getUNDEF(MVT::v8i16);
3554        First = false;
3555      }
3556      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
3557                      MVT::v8i16, V, Op.getOperand(i),
3558                      DAG.getIntPtrConstant(i));
3559    }
3560  }
3561
3562  return V;
3563}
3564
3565/// getVShift - Return a vector logical shift node.
3566///
3567static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
3568                         unsigned NumBits, SelectionDAG &DAG,
3569                         const TargetLowering &TLI, DebugLoc dl) {
3570  bool isMMX = VT.getSizeInBits() == 64;
3571  EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
3572  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
3573  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
3574  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3575                     DAG.getNode(Opc, dl, ShVT, SrcOp,
3576                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
3577}
3578
3579SDValue
3580X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
3581                                          SelectionDAG &DAG) {
3582
3583  // Check if the scalar load can be widened into a vector load. And if
3584  // the address is "base + cst" see if the cst can be "absorbed" into
3585  // the shuffle mask.
3586  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
3587    SDValue Ptr = LD->getBasePtr();
3588    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
3589      return SDValue();
3590    EVT PVT = LD->getValueType(0);
3591    if (PVT != MVT::i32 && PVT != MVT::f32)
3592      return SDValue();
3593
3594    int FI = -1;
3595    int64_t Offset = 0;
3596    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
3597      FI = FINode->getIndex();
3598      Offset = 0;
3599    } else if (Ptr.getOpcode() == ISD::ADD &&
3600               isa<ConstantSDNode>(Ptr.getOperand(1)) &&
3601               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
3602      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
3603      Offset = Ptr.getConstantOperandVal(1);
3604      Ptr = Ptr.getOperand(0);
3605    } else {
3606      return SDValue();
3607    }
3608
3609    SDValue Chain = LD->getChain();
3610    // Make sure the stack object alignment is at least 16.
3611    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3612    if (DAG.InferPtrAlignment(Ptr) < 16) {
3613      if (MFI->isFixedObjectIndex(FI)) {
3614        // Can't change the alignment. FIXME: It's possible to compute
3615        // the exact stack offset and reference FI + adjust offset instead.
3616        // If someone *really* cares about this. That's the way to implement it.
3617        return SDValue();
3618      } else {
3619        MFI->setObjectAlignment(FI, 16);
3620      }
3621    }
3622
3623    // (Offset % 16) must be a multiple of 4. The address is then
3624    // Ptr + (Offset & ~15).
3625    if (Offset < 0)
3626      return SDValue();
3627    if ((Offset % 16) & 3)
3628      return SDValue();
3629    int64_t StartOffset = Offset & ~15;
3630    if (StartOffset)
3631      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
3632                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
3633
3634    int EltNo = (Offset - StartOffset) >> 2;
3635    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
3636    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
3637    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0,
3638                             false, false, 0);
3639    // Canonicalize it to a v4i32 shuffle.
3640    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
3641    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3642                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
3643                                            DAG.getUNDEF(MVT::v4i32), &Mask[0]));
3644  }
3645
3646  return SDValue();
3647}
3648
3649/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
3650/// vector of type 'VT', see if the elements can be replaced by a single large
3651/// load which has the same value as a build_vector whose operands are 'elts'.
3652///
3653/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
3654///
3655/// FIXME: we'd also like to handle the case where the last elements are zero
3656/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
3657/// There's even a handy isZeroNode for that purpose.
3658static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
3659                                        DebugLoc &dl, SelectionDAG &DAG) {
3660  EVT EltVT = VT.getVectorElementType();
3661  unsigned NumElems = Elts.size();
3662
3663  LoadSDNode *LDBase = NULL;
3664  unsigned LastLoadedElt = -1U;
3665
3666  // For each element in the initializer, see if we've found a load or an undef.
3667  // If we don't find an initial load element, or later load elements are
3668  // non-consecutive, bail out.
3669  for (unsigned i = 0; i < NumElems; ++i) {
3670    SDValue Elt = Elts[i];
3671
3672    if (!Elt.getNode() ||
3673        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
3674      return SDValue();
3675    if (!LDBase) {
3676      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
3677        return SDValue();
3678      LDBase = cast<LoadSDNode>(Elt.getNode());
3679      LastLoadedElt = i;
3680      continue;
3681    }
3682    if (Elt.getOpcode() == ISD::UNDEF)
3683      continue;
3684
3685    LoadSDNode *LD = cast<LoadSDNode>(Elt);
3686    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
3687      return SDValue();
3688    LastLoadedElt = i;
3689  }
3690
3691  // If we have found an entire vector of loads and undefs, then return a large
3692  // load of the entire vector width starting at the base pointer.  If we found
3693  // consecutive loads for the low half, generate a vzext_load node.
3694  if (LastLoadedElt == NumElems - 1) {
3695    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
3696      return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
3697                         LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
3698                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
3699    return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
3700                       LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
3701                       LDBase->isVolatile(), LDBase->isNonTemporal(),
3702                       LDBase->getAlignment());
3703  } else if (NumElems == 4 && LastLoadedElt == 1) {
3704    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
3705    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
3706    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
3707    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
3708  }
3709  return SDValue();
3710}
3711
3712SDValue
3713X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
3714  DebugLoc dl = Op.getDebugLoc();
3715  // All zeros are handled with pxor, all ones are handled with pcmpeqd.
3716  if (ISD::isBuildVectorAllZeros(Op.getNode())
3717      || ISD::isBuildVectorAllOnes(Op.getNode())) {
3718    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
3719    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
3720    // eliminated on x86-32 hosts.
3721    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
3722      return Op;
3723
3724    if (ISD::isBuildVectorAllOnes(Op.getNode()))
3725      return getOnesVector(Op.getValueType(), DAG, dl);
3726    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
3727  }
3728
3729  EVT VT = Op.getValueType();
3730  EVT ExtVT = VT.getVectorElementType();
3731  unsigned EVTBits = ExtVT.getSizeInBits();
3732
3733  unsigned NumElems = Op.getNumOperands();
3734  unsigned NumZero  = 0;
3735  unsigned NumNonZero = 0;
3736  unsigned NonZeros = 0;
3737  bool IsAllConstants = true;
3738  SmallSet<SDValue, 8> Values;
3739  for (unsigned i = 0; i < NumElems; ++i) {
3740    SDValue Elt = Op.getOperand(i);
3741    if (Elt.getOpcode() == ISD::UNDEF)
3742      continue;
3743    Values.insert(Elt);
3744    if (Elt.getOpcode() != ISD::Constant &&
3745        Elt.getOpcode() != ISD::ConstantFP)
3746      IsAllConstants = false;
3747    if (X86::isZeroNode(Elt))
3748      NumZero++;
3749    else {
3750      NonZeros |= (1 << i);
3751      NumNonZero++;
3752    }
3753  }
3754
3755  if (NumNonZero == 0) {
3756    // All undef vector. Return an UNDEF.  All zero vectors were handled above.
3757    return DAG.getUNDEF(VT);
3758  }
3759
3760  // Special case for single non-zero, non-undef, element.
3761  if (NumNonZero == 1) {
3762    unsigned Idx = CountTrailingZeros_32(NonZeros);
3763    SDValue Item = Op.getOperand(Idx);
3764
3765    // If this is an insertion of an i64 value on x86-32, and if the top bits of
3766    // the value are obviously zero, truncate the value to i32 and do the
3767    // insertion that way.  Only do this if the value is non-constant or if the
3768    // value is a constant being inserted into element 0.  It is cheaper to do
3769    // a constant pool load than it is to do a movd + shuffle.
3770    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
3771        (!IsAllConstants || Idx == 0)) {
3772      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
3773        // Handle MMX and SSE both.
3774        EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
3775        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
3776
3777        // Truncate the value (which may itself be a constant) to i32, and
3778        // convert it to a vector with movd (S2V+shuffle to zero extend).
3779        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
3780        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
3781        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3782                                           Subtarget->hasSSE2(), DAG);
3783
3784        // Now we have our 32-bit value zero extended in the low element of
3785        // a vector.  If Idx != 0, swizzle it into place.
3786        if (Idx != 0) {
3787          SmallVector<int, 4> Mask;
3788          Mask.push_back(Idx);
3789          for (unsigned i = 1; i != VecElts; ++i)
3790            Mask.push_back(i);
3791          Item = DAG.getVectorShuffle(VecVT, dl, Item,
3792                                      DAG.getUNDEF(Item.getValueType()),
3793                                      &Mask[0]);
3794        }
3795        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
3796      }
3797    }
3798
3799    // If we have a constant or non-constant insertion into the low element of
3800    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
3801    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
3802    // depending on what the source datatype is.
3803    if (Idx == 0) {
3804      if (NumZero == 0) {
3805        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3806      } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
3807          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
3808        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3809        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
3810        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
3811                                           DAG);
3812      } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
3813        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
3814        EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
3815        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
3816        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3817                                           Subtarget->hasSSE2(), DAG);
3818        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
3819      }
3820    }
3821
3822    // Is it a vector logical left shift?
3823    if (NumElems == 2 && Idx == 1 &&
3824        X86::isZeroNode(Op.getOperand(0)) &&
3825        !X86::isZeroNode(Op.getOperand(1))) {
3826      unsigned NumBits = VT.getSizeInBits();
3827      return getVShift(true, VT,
3828                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
3829                                   VT, Op.getOperand(1)),
3830                       NumBits/2, DAG, *this, dl);
3831    }
3832
3833    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
3834      return SDValue();
3835
3836    // Otherwise, if this is a vector with i32 or f32 elements, and the element
3837    // is a non-constant being inserted into an element other than the low one,
3838    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
3839    // movd/movss) to move this into the low element, then shuffle it into
3840    // place.
3841    if (EVTBits == 32) {
3842      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3843
3844      // Turn it into a shuffle of zero and zero-extended scalar to vector.
3845      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
3846                                         Subtarget->hasSSE2(), DAG);
3847      SmallVector<int, 8> MaskVec;
3848      for (unsigned i = 0; i < NumElems; i++)
3849        MaskVec.push_back(i == Idx ? 0 : 1);
3850      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
3851    }
3852  }
3853
3854  // Splat is obviously ok. Let legalizer expand it to a shuffle.
3855  if (Values.size() == 1) {
3856    if (EVTBits == 32) {
3857      // Instead of a shuffle like this:
3858      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
3859      // Check if it's possible to issue this instead.
3860      // shuffle (vload ptr), undef, <1, 1, 1, 1>
3861      unsigned Idx = CountTrailingZeros_32(NonZeros);
3862      SDValue Item = Op.getOperand(Idx);
3863      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
3864        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
3865    }
3866    return SDValue();
3867  }
3868
3869  // A vector full of immediates; various special cases are already
3870  // handled, so this is best done with a single constant-pool load.
3871  if (IsAllConstants)
3872    return SDValue();
3873
3874  // Let legalizer expand 2-wide build_vectors.
3875  if (EVTBits == 64) {
3876    if (NumNonZero == 1) {
3877      // One half is zero or undef.
3878      unsigned Idx = CountTrailingZeros_32(NonZeros);
3879      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
3880                                 Op.getOperand(Idx));
3881      return getShuffleVectorZeroOrUndef(V2, Idx, true,
3882                                         Subtarget->hasSSE2(), DAG);
3883    }
3884    return SDValue();
3885  }
3886
3887  // If element VT is < 32 bits, convert it to inserts into a zero vector.
3888  if (EVTBits == 8 && NumElems == 16) {
3889    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
3890                                        *this);
3891    if (V.getNode()) return V;
3892  }
3893
3894  if (EVTBits == 16 && NumElems == 8) {
3895    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
3896                                        *this);
3897    if (V.getNode()) return V;
3898  }
3899
3900  // If element VT is == 32 bits, turn it into a number of shuffles.
3901  SmallVector<SDValue, 8> V;
3902  V.resize(NumElems);
3903  if (NumElems == 4 && NumZero > 0) {
3904    for (unsigned i = 0; i < 4; ++i) {
3905      bool isZero = !(NonZeros & (1 << i));
3906      if (isZero)
3907        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
3908      else
3909        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3910    }
3911
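    // Combine adjacent pairs of single-element vectors.  The two NonZeros bits
    // for pair i tell which of elements 2*i and 2*i+1 are non-zero: a lone
    // non-zero element is merged into the adjacent zero vector with a MOVL,
    // and two non-zero elements are interleaved with an unpackl.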
3912    for (unsigned i = 0; i < 2; ++i) {
3913      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
3914        default: break;
3915        case 0:
3916          V[i] = V[i*2];  // Must be a zero vector.
3917          break;
3918        case 1:
3919          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
3920          break;
3921        case 2:
3922          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
3923          break;
3924        case 3:
3925          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
3926          break;
3927      }
3928    }
3929
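    // When only the upper element of a pair was non-zero (case 2 above), the
    // MOVL left its value in lane 0, so the final shuffle must swap that pair
    // back into place; the Reverse flags below account for this.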
3930    SmallVector<int, 8> MaskVec;
3931    bool Reverse = (NonZeros & 0x3) == 2;
3932    for (unsigned i = 0; i < 2; ++i)
3933      MaskVec.push_back(Reverse ? 1-i : i);
3934    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
3935    for (unsigned i = 0; i < 2; ++i)
3936      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
3937    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
3938  }
3939
3940  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
3941    // Check for a build vector of consecutive loads.
3942    for (unsigned i = 0; i < NumElems; ++i)
3943      V[i] = Op.getOperand(i);
3944
3945    // Check for elements which are consecutive loads.
3946    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
3947    if (LD.getNode())
3948      return LD;
3949
3950    // For SSE 4.1, use inserts into undef.
3951    if (getSubtarget()->hasSSE41()) {
3952      V[0] = DAG.getUNDEF(VT);
3953      for (unsigned i = 0; i < NumElems; ++i)
3954        if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
3955          V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
3956                             Op.getOperand(i), DAG.getIntPtrConstant(i));
3957      return V[0];
3958    }
3959
3960    // Otherwise, expand into a number of unpckl*
3961    // e.g. for v4f32
3962    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
3963    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
3964    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
3965    for (unsigned i = 0; i < NumElems; ++i)
3966      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3967    NumElems >>= 1;
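    // Each iteration of the loop below merges pairs of partial vectors with
    // unpackl and halves the count, so V[0] ends up holding the full result.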
3968    while (NumElems != 0) {
3969      for (unsigned i = 0; i < NumElems; ++i)
3970        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
3971      NumElems >>= 1;
3972    }
3973    return V[0];
3974  }
3975  return SDValue();
3976}
3977
3978SDValue
3979X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
3980  // We support concatenating two MMX registers and placing them in an MMX
3981  // register.  This is better than doing the conversion through a stack slot.
3982  DebugLoc dl = Op.getDebugLoc();
3983  EVT ResVT = Op.getValueType();
3984  assert(Op.getNumOperands() == 2);
3985  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
3986         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
3987  int Mask[2];
3988  SDValue InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, Op.getOperand(0));
3989  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
3990  InVec = Op.getOperand(1);
3991  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
3992    unsigned NumElts = ResVT.getVectorNumElements();
3993    VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
3994    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
3995                       InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
3996  } else {
3997    InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
3998    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
3999    Mask[0] = 0; Mask[1] = 2;
4000    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
4001  }
4002  return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
4003}
4004
4005// v8i16 shuffles - Prefer shuffles in the following order:
4006// 1. [all]   pshuflw, pshufhw, optional move
4007// 2. [ssse3] 1 x pshufb
4008// 3. [ssse3] 2 x pshufb + 1 x por
4009// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
4010static
4011SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
4012                                 SelectionDAG &DAG, X86TargetLowering &TLI) {
4013  SDValue V1 = SVOp->getOperand(0);
4014  SDValue V2 = SVOp->getOperand(1);
4015  DebugLoc dl = SVOp->getDebugLoc();
4016  SmallVector<int, 8> MaskVals;
4017
4018  // Determine whether more than one word in each of the low and high quadwords
4019  // of the result comes from the same quadword of one of the two inputs.  Undef
4020  // mask values count as coming from any quadword, for better codegen.
4021  SmallVector<unsigned, 4> LoQuad(4);
4022  SmallVector<unsigned, 4> HiQuad(4);
4023  BitVector InputQuads(4);
4024  for (unsigned i = 0; i < 8; ++i) {
4025    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
4026    int EltIdx = SVOp->getMaskElt(i);
4027    MaskVals.push_back(EltIdx);
4028    if (EltIdx < 0) {
4029      ++Quad[0];
4030      ++Quad[1];
4031      ++Quad[2];
4032      ++Quad[3];
4033      continue;
4034    }
4035    ++Quad[EltIdx / 4];
4036    InputQuads.set(EltIdx / 4);
4037  }
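  // LoQuad[q] and HiQuad[q] now count how many of the four low / high result
  // words want input quadword q (quads 0-1 are V1's halves, 2-3 are V2's);
  // an undef word was counted toward every quadword.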
4038
4039  int BestLoQuad = -1;
4040  unsigned MaxQuad = 1;
4041  for (unsigned i = 0; i < 4; ++i) {
4042    if (LoQuad[i] > MaxQuad) {
4043      BestLoQuad = i;
4044      MaxQuad = LoQuad[i];
4045    }
4046  }
4047
4048  int BestHiQuad = -1;
4049  MaxQuad = 1;
4050  for (unsigned i = 0; i < 4; ++i) {
4051    if (HiQuad[i] > MaxQuad) {
4052      BestHiQuad = i;
4053      MaxQuad = HiQuad[i];
4054    }
4055  }
4056
4057  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
4058  // of the two input vectors, shuffle them into one input vector so only a
4059  // single pshufb instruction is necessary. If there are more than 2 input
4060  // quads, disable the next transformation since it does not help SSSE3.
4061  bool V1Used = InputQuads[0] || InputQuads[1];
4062  bool V2Used = InputQuads[2] || InputQuads[3];
4063  if (TLI.getSubtarget()->hasSSSE3()) {
4064    if (InputQuads.count() == 2 && V1Used && V2Used) {
4065      BestLoQuad = InputQuads.find_first();
4066      BestHiQuad = InputQuads.find_next(BestLoQuad);
4067    }
4068    if (InputQuads.count() > 2) {
4069      BestLoQuad = -1;
4070      BestHiQuad = -1;
4071    }
4072  }
4073
4074  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
4075  // the shuffle mask.  If a quad is scored as -1, that means that it contains
4076  // words from all 4 input quadwords.
4077  SDValue NewV;
4078  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
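    // View both inputs as v2i64 so each quadword is one element: mask values
    // 0 and 1 pick V1's low/high quadword, 2 and 3 pick V2's.  The chosen
    // quads land in the low and high halves of NewV respectively.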
4079    SmallVector<int, 8> MaskV;
4080    MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
4081    MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
4082    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
4083                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
4084                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
4085    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
4086
4087    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
4088    // source words for the shuffle, to aid later transformations.
4089    bool AllWordsInNewV = true;
4090    bool InOrder[2] = { true, true };
4091    for (unsigned i = 0; i != 8; ++i) {
4092      int idx = MaskVals[i];
4093      if (idx != (int)i)
4094        InOrder[i/4] = false;
4095      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
4096        continue;
4097      AllWordsInNewV = false;
4098      break;
4099    }
4100
4101    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
4102    if (AllWordsInNewV) {
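      // All source words now live in NewV: BestLoQuad occupies words 0-3 and
      // BestHiQuad words 4-7, so remap every mask index into NewV's numbering.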
4103      for (int i = 0; i != 8; ++i) {
4104        int idx = MaskVals[i];
4105        if (idx < 0)
4106          continue;
4107        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
4108        if ((idx != i) && idx < 4)
4109          pshufhw = false;
4110        if ((idx != i) && idx > 3)
4111          pshuflw = false;
4112      }
4113      V1 = NewV;
4114      V2Used = false;
4115      BestLoQuad = 0;
4116      BestHiQuad = 1;
4117    }
4118
4119    // If we've eliminated the use of V2, and the new mask is a pshuflw or
4120    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
4121    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
4122      return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
4123                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
4124    }
4125  }
4126
4127  // If we have SSSE3, and all words of the result are from 1 input vector,
4128  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
4129  // is present, fall back to case 4.
4130  if (TLI.getSubtarget()->hasSSSE3()) {
4131    SmallVector<SDValue,16> pshufbMask;
4132
4133    // If we have elements from both input vectors, set the high bit of the
4134    // shuffle mask element to zero out elements that come from V2 in the V1
4135    // mask, and elements that come from V1 in the V2 mask, so that the two
4136    // results can be OR'd together.
4137    bool TwoInputs = V1Used && V2Used;
4138    for (unsigned i = 0; i != 8; ++i) {
4139      int EltIdx = MaskVals[i] * 2;
4140      if (TwoInputs && (EltIdx >= 16)) {
4141        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4142        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4143        continue;
4144      }
4145      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
4146      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
4147    }
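    // pshufb selects each result byte by the index in its mask byte; an index
    // with the high bit set (0x80) yields a zero byte instead, which is how
    // bytes that should come from V2 are zeroed when both inputs are used.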
4148    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
4149    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4150                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4151                                 MVT::v16i8, &pshufbMask[0], 16));
4152    if (!TwoInputs)
4153      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4154
4155    // Calculate the shuffle mask for the second input, shuffle it, and
4156    // OR it with the first shuffled input.
4157    pshufbMask.clear();
4158    for (unsigned i = 0; i != 8; ++i) {
4159      int EltIdx = MaskVals[i] * 2;
4160      if (EltIdx < 16) {
4161        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4162        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4163        continue;
4164      }
4165      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4166      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
4167    }
4168    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
4169    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4170                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4171                                 MVT::v16i8, &pshufbMask[0], 16));
4172    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4173    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4174  }
4175
4176  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
4177  // and update MaskVals with new element order.
4178  BitVector InOrder(8);
4179  if (BestLoQuad >= 0) {
4180    SmallVector<int, 8> MaskV;
4181    for (int i = 0; i != 4; ++i) {
4182      int idx = MaskVals[i];
4183      if (idx < 0) {
4184        MaskV.push_back(-1);
4185        InOrder.set(i);
4186      } else if ((idx / 4) == BestLoQuad) {
4187        MaskV.push_back(idx & 3);
4188        InOrder.set(i);
4189      } else {
4190        MaskV.push_back(-1);
4191      }
4192    }
4193    for (unsigned i = 4; i != 8; ++i)
4194      MaskV.push_back(i);
4195    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
4196                                &MaskV[0]);
4197  }
4198
4199  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
4200  // and update MaskVals with the new element order.
4201  if (BestHiQuad >= 0) {
4202    SmallVector<int, 8> MaskV;
4203    for (unsigned i = 0; i != 4; ++i)
4204      MaskV.push_back(i);
4205    for (unsigned i = 4; i != 8; ++i) {
4206      int idx = MaskVals[i];
4207      if (idx < 0) {
4208        MaskV.push_back(-1);
4209        InOrder.set(i);
4210      } else if ((idx / 4) == BestHiQuad) {
4211        MaskV.push_back((idx & 3) + 4);
4212        InOrder.set(i);
4213      } else {
4214        MaskV.push_back(-1);
4215      }
4216    }
4217    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
4218                                &MaskV[0]);
4219  }
4220
4221  // In case BestHiQuad & BestLoQuad were both -1, which means each quadword has
4222  // a word from each of the four input quadwords, calculate the InOrder
4223  // bitvector now before falling through to the insert/extract cleanup.
4224  if (BestLoQuad == -1 && BestHiQuad == -1) {
4225    NewV = V1;
4226    for (int i = 0; i != 8; ++i)
4227      if (MaskVals[i] < 0 || MaskVals[i] == i)
4228        InOrder.set(i);
4229  }
4230
4231  // The other elements are put in the right place using pextrw and pinsrw.
4232  for (unsigned i = 0; i != 8; ++i) {
4233    if (InOrder[i])
4234      continue;
4235    int EltIdx = MaskVals[i];
4236    if (EltIdx < 0)
4237      continue;
4238    SDValue ExtOp = (EltIdx < 8)
4239    ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
4240                  DAG.getIntPtrConstant(EltIdx))
4241    : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
4242                  DAG.getIntPtrConstant(EltIdx - 8));
4243    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
4244                       DAG.getIntPtrConstant(i));
4245  }
4246  return NewV;
4247}
4248
4249// v16i8 shuffles - Prefer shuffles in the following order:
4250// 1. [ssse3] 1 x pshufb
4251// 2. [ssse3] 2 x pshufb + 1 x por
4252// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
4253static
4254SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
4255                                 SelectionDAG &DAG, X86TargetLowering &TLI) {
4256  SDValue V1 = SVOp->getOperand(0);
4257  SDValue V2 = SVOp->getOperand(1);
4258  DebugLoc dl = SVOp->getDebugLoc();
4259  SmallVector<int, 16> MaskVals;
4260  SVOp->getMask(MaskVals);
4261
4262  // If we have SSSE3, case 1 is generated when all result bytes come from
4263  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
4264  // present, fall back to case 3.
4265  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
4266  bool V1Only = true;
4267  bool V2Only = true;
4268  for (unsigned i = 0; i < 16; ++i) {
4269    int EltIdx = MaskVals[i];
4270    if (EltIdx < 0)
4271      continue;
4272    if (EltIdx < 16)
4273      V2Only = false;
4274    else
4275      V1Only = false;
4276  }
4277
4278  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
4279  if (TLI.getSubtarget()->hasSSSE3()) {
4280    SmallVector<SDValue,16> pshufbMask;
4281
4282    // If all result elements are from one input vector, then only translate
4283    // undef mask values to 0x80 (zero out result) in the pshufb mask.
4284    //
4285    // Otherwise, we have elements from both input vectors, and must zero out
4286    // elements that come from V2 in the first mask, and V1 in the second mask
4287    // so that we can OR them together.
4288    bool TwoInputs = !(V1Only || V2Only);
4289    for (unsigned i = 0; i != 16; ++i) {
4290      int EltIdx = MaskVals[i];
4291      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
4292        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4293        continue;
4294      }
4295      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
4296    }
4297    // If all the elements are from V2, assign it to V1 and return after
4298    // building the first pshufb.
4299    if (V2Only)
4300      V1 = V2;
4301    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4302                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4303                                 MVT::v16i8, &pshufbMask[0], 16));
4304    if (!TwoInputs)
4305      return V1;
4306
4307    // Calculate the shuffle mask for the second input, shuffle it, and
4308    // OR it with the first shuffled input.
4309    pshufbMask.clear();
4310    for (unsigned i = 0; i != 16; ++i) {
4311      int EltIdx = MaskVals[i];
4312      if (EltIdx < 16) {
4313        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4314        continue;
4315      }
4316      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4317    }
4318    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4319                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4320                                 MVT::v16i8, &pshufbMask[0], 16));
4321    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4322  }
4323
4324  // No SSSE3 - Calculate the in-place words and then fix all out-of-place words
4325  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
4326  // the 16 different words that comprise the two doublequadword input vectors.
4327  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4328  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
4329  SDValue NewV = V2Only ? V2 : V1;
4330  for (int i = 0; i != 8; ++i) {
4331    int Elt0 = MaskVals[i*2];
4332    int Elt1 = MaskVals[i*2+1];
4333
4334    // This word of the result is all undef, skip it.
4335    if (Elt0 < 0 && Elt1 < 0)
4336      continue;
4337
4338    // This word of the result is already in the correct place, skip it.
4339    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
4340      continue;
4341    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
4342      continue;
4343
4344    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
4345    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
4346    SDValue InsElt;
4347
4348    // If Elt0 and Elt1 are defined, consecutive, and start on an even source
4349    // byte, they form one aligned source word: extract it once and insert it.
4350    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
4351      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4352                           DAG.getIntPtrConstant(Elt1 / 2));
4353      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4354                        DAG.getIntPtrConstant(i));
4355      continue;
4356    }
4357
4358    // If Elt1 is defined, extract it from the appropriate source.  If the
4359    // source byte is not also odd, shift the extracted word left 8 bits;
4360    // otherwise clear the bottom 8 bits if we need to do an OR.
4361    if (Elt1 >= 0) {
4362      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4363                           DAG.getIntPtrConstant(Elt1 / 2));
4364      if ((Elt1 & 1) == 0)
4365        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
4366                             DAG.getConstant(8, TLI.getShiftAmountTy()));
4367      else if (Elt0 >= 0)
4368        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
4369                             DAG.getConstant(0xFF00, MVT::i16));
4370    }
4371    // If Elt0 is defined, extract it from the appropriate source.  If the
4372    // source byte is not also even, shift the extracted word right 8 bits. If
4373    // Elt1 was also defined, OR the extracted values together before
4374    // inserting them in the result.
4375    if (Elt0 >= 0) {
4376      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
4377                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
4378      if ((Elt0 & 1) != 0)
4379        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
4380                              DAG.getConstant(8, TLI.getShiftAmountTy()));
4381      else if (Elt1 >= 0)
4382        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
4383                             DAG.getConstant(0x00FF, MVT::i16));
4384      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
4385                         : InsElt0;
4386    }
4387    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4388                       DAG.getIntPtrConstant(i));
4389  }
4390  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
4391}
4392
4393/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
4394/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
4395/// done when every pair / quad of shuffle mask elements points to elements in
4396/// the right sequence. e.g.
4397/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
4398static
4399SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
4400                                 SelectionDAG &DAG,
4401                                 TargetLowering &TLI, DebugLoc dl) {
4402  EVT VT = SVOp->getValueType(0);
4403  SDValue V1 = SVOp->getOperand(0);
4404  SDValue V2 = SVOp->getOperand(1);
4405  unsigned NumElems = VT.getVectorNumElements();
4406  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
4407  EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
4408  EVT MaskEltVT = MaskVT.getVectorElementType();
4409  EVT NewVT = MaskVT;
4410  switch (VT.getSimpleVT().SimpleTy) {
4411  default: assert(false && "Unexpected!");
4412  case MVT::v4f32: NewVT = MVT::v2f64; break;
4413  case MVT::v4i32: NewVT = MVT::v2i64; break;
4414  case MVT::v8i16: NewVT = MVT::v4i32; break;
4415  case MVT::v16i8: NewVT = MVT::v4i32; break;
4416  }
4417
4418  if (NewWidth == 2) {
4419    if (VT.isInteger())
4420      NewVT = MVT::v2i64;
4421    else
4422      NewVT = MVT::v2f64;
4423  }
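  // A group of Scale adjacent mask entries can be collapsed into one wider
  // element only if it references Scale consecutive source elements starting
  // at a multiple of Scale; otherwise the narrowing is not possible.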
4424  int Scale = NumElems / NewWidth;
4425  SmallVector<int, 8> MaskVec;
4426  for (unsigned i = 0; i < NumElems; i += Scale) {
4427    int StartIdx = -1;
4428    for (int j = 0; j < Scale; ++j) {
4429      int EltIdx = SVOp->getMaskElt(i+j);
4430      if (EltIdx < 0)
4431        continue;
4432      if (StartIdx == -1)
4433        StartIdx = EltIdx - (EltIdx % Scale);
4434      if (EltIdx != StartIdx + j)
4435        return SDValue();
4436    }
4437    if (StartIdx == -1)
4438      MaskVec.push_back(-1);
4439    else
4440      MaskVec.push_back(StartIdx / Scale);
4441  }
4442
4443  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
4444  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
4445  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
4446}
4447
4448/// getVZextMovL - Return a zero-extending vector move low node.
4449///
4450static SDValue getVZextMovL(EVT VT, EVT OpVT,
4451                            SDValue SrcOp, SelectionDAG &DAG,
4452                            const X86Subtarget *Subtarget, DebugLoc dl) {
4453  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
4454    LoadSDNode *LD = NULL;
4455    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
4456      LD = dyn_cast<LoadSDNode>(SrcOp);
4457    if (!LD) {
4458      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
4459      // instead.
4460      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
4461      if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
4462          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
4463          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
4464          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
4465        // PR2108
4466        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
4467        return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4468                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4469                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4470                                                   OpVT,
4471                                                   SrcOp.getOperand(0)
4472                                                          .getOperand(0))));
4473      }
4474    }
4475  }
4476
4477  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4478                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4479                                 DAG.getNode(ISD::BIT_CONVERT, dl,
4480                                             OpVT, SrcOp)));
4481}
4482
4483/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
4484/// shuffles.
4485static SDValue
4486LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
4487  SDValue V1 = SVOp->getOperand(0);
4488  SDValue V2 = SVOp->getOperand(1);
4489  DebugLoc dl = SVOp->getDebugLoc();
4490  EVT VT = SVOp->getValueType(0);
4491
4492  SmallVector<std::pair<int, int>, 8> Locs;
4493  Locs.resize(4);
4494  SmallVector<int, 8> Mask1(4U, -1);
4495  SmallVector<int, 8> PermMask;
4496  SVOp->getMask(PermMask);
4497
4498  unsigned NumHi = 0;
4499  unsigned NumLo = 0;
4500  for (unsigned i = 0; i != 4; ++i) {
4501    int Idx = PermMask[i];
4502    if (Idx < 0) {
4503      Locs[i] = std::make_pair(-1, -1);
4504    } else {
4505      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
4506      if (Idx < 4) {
4507        Locs[i] = std::make_pair(0, NumLo);
4508        Mask1[NumLo] = Idx;
4509        NumLo++;
4510      } else {
4511        Locs[i] = std::make_pair(1, NumHi);
4512        if (2+NumHi < 4)
4513          Mask1[2+NumHi] = Idx;
4514        NumHi++;
4515      }
4516    }
4517  }
4518
4519  if (NumLo <= 2 && NumHi <= 2) {
4520    // No more than two elements come from either vector. This can be
4521    // implemented with two shuffles. The first shuffle gathers the elements.
4522    // The second shuffle, which takes the first shuffle as both of its
4523    // vector operands, puts the elements into the right order.
4524    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4525
4526    SmallVector<int, 8> Mask2(4U, -1);
4527
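    // The second shuffle uses the gathered vector as both operands: result
    // lanes 0-1 take indices in 0-3 (first operand) and lanes 2-3 take
    // indices in 4-7 (second operand), so the mask can still be matched as a
    // single shufps.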
4528    for (unsigned i = 0; i != 4; ++i) {
4529      if (Locs[i].first == -1)
4530        continue;
4531      else {
4532        unsigned Idx = (i < 2) ? 0 : 4;
4533        Idx += Locs[i].first * 2 + Locs[i].second;
4534        Mask2[i] = Idx;
4535      }
4536    }
4537
4538    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
4539  } else if (NumLo == 3 || NumHi == 3) {
4540    // Otherwise, we must have three elements from one vector, call it X, and
4541    // one element from the other, call it Y.  First, use a shufps to build an
4542    // intermediate vector with the one element from Y and the element from X
4543    // that will be in the same half in the final destination (the indexes don't
4544    // matter). Then, use a shufps to build the final vector, taking the half
4545    // containing the element from Y from the intermediate, and the other half
4546    // from X.
4547    if (NumHi == 3) {
4548      // Normalize it so the 3 elements come from V1.
4549      CommuteVectorShuffleMask(PermMask, VT);
4550      std::swap(V1, V2);
4551    }
4552
4553    // Find the element from V2.
4554    unsigned HiIndex;
4555    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
4556      int Val = PermMask[HiIndex];
4557      if (Val < 0)
4558        continue;
4559      if (Val >= 4)
4560        break;
4561    }
4562
4563    Mask1[0] = PermMask[HiIndex];
4564    Mask1[1] = -1;
4565    Mask1[2] = PermMask[HiIndex^1];
4566    Mask1[3] = -1;
4567    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4568
4569    if (HiIndex >= 2) {
4570      Mask1[0] = PermMask[0];
4571      Mask1[1] = PermMask[1];
4572      Mask1[2] = HiIndex & 1 ? 6 : 4;
4573      Mask1[3] = HiIndex & 1 ? 4 : 6;
4574      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4575    } else {
4576      Mask1[0] = HiIndex & 1 ? 2 : 0;
4577      Mask1[1] = HiIndex & 1 ? 0 : 2;
4578      Mask1[2] = PermMask[2];
4579      Mask1[3] = PermMask[3];
4580      if (Mask1[2] >= 0)
4581        Mask1[2] += 4;
4582      if (Mask1[3] >= 0)
4583        Mask1[3] += 4;
4584      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
4585    }
4586  }
4587
4588  // Break it into (shuffle shuffle_hi, shuffle_lo).
4589  Locs.clear();
  Locs.resize(4);
4590  SmallVector<int,8> LoMask(4U, -1);
4591  SmallVector<int,8> HiMask(4U, -1);
4592
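  // Gather the elements destined for result lanes 0-1 into LoShuffle and
  // those for lanes 2-3 into HiShuffle; the final shuffle then selects lanes
  // 0-1 from LoShuffle (indices 0-3) and lanes 2-3 from HiShuffle (4-7).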
4593  SmallVector<int,8> *MaskPtr = &LoMask;
4594  unsigned MaskIdx = 0;
4595  unsigned LoIdx = 0;
4596  unsigned HiIdx = 2;
4597  for (unsigned i = 0; i != 4; ++i) {
4598    if (i == 2) {
4599      MaskPtr = &HiMask;
4600      MaskIdx = 1;
4601      LoIdx = 0;
4602      HiIdx = 2;
4603    }
4604    int Idx = PermMask[i];
4605    if (Idx < 0) {
4606      Locs[i] = std::make_pair(-1, -1);
4607    } else if (Idx < 4) {
4608      Locs[i] = std::make_pair(MaskIdx, LoIdx);
4609      (*MaskPtr)[LoIdx] = Idx;
4610      LoIdx++;
4611    } else {
4612      Locs[i] = std::make_pair(MaskIdx, HiIdx);
4613      (*MaskPtr)[HiIdx] = Idx;
4614      HiIdx++;
4615    }
4616  }
4617
4618  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
4619  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
4620  SmallVector<int, 8> MaskOps;
4621  for (unsigned i = 0; i != 4; ++i) {
4622    if (Locs[i].first == -1) {
4623      MaskOps.push_back(-1);
4624    } else {
4625      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
4626      MaskOps.push_back(Idx);
4627    }
4628  }
4629  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
4630}
4631
4632SDValue
4633X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
4634  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
4635  SDValue V1 = Op.getOperand(0);
4636  SDValue V2 = Op.getOperand(1);
4637  EVT VT = Op.getValueType();
4638  DebugLoc dl = Op.getDebugLoc();
4639  unsigned NumElems = VT.getVectorNumElements();
4640  bool isMMX = VT.getSizeInBits() == 64;
4641  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
4642  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
4643  bool V1IsSplat = false;
4644  bool V2IsSplat = false;
4645
4646  if (isZeroShuffle(SVOp))
4647    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
4648
4649  // Promote splats to v4f32.
4650  if (SVOp->isSplat()) {
4651    if (isMMX || NumElems < 4)
4652      return Op;
4653    return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
4654  }
4655
4656  // If the shuffle can be profitably rewritten as a narrower shuffle, then
4657  // do it!
4658  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
4659    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4660    if (NewOp.getNode())
4661      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4662                         LowerVECTOR_SHUFFLE(NewOp, DAG));
4663  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
4664    // FIXME: Figure out a cleaner way to do this.
4665    // Try to make use of movq to zero out the top part.
4666    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
4667      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4668      if (NewOp.getNode()) {
4669        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
4670          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
4671                              DAG, Subtarget, dl);
4672      }
4673    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
4674      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4675      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
4676        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
4677                            DAG, Subtarget, dl);
4678    }
4679  }
4680
4681  if (X86::isPSHUFDMask(SVOp))
4682    return Op;
4683
4684  // Check if this can be converted into a logical shift.
4685  bool isLeft = false;
4686  unsigned ShAmt = 0;
4687  SDValue ShVal;
4688  bool isShift = getSubtarget()->hasSSE2() &&
4689    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
4690  if (isShift && ShVal.hasOneUse()) {
4691    // If the shifted value has multiple uses, it may be cheaper to use
4692    // v_set0 + movlhps or movhlps, etc.
4693    EVT EltVT = VT.getVectorElementType();
4694    ShAmt *= EltVT.getSizeInBits();
4695    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4696  }
4697
4698  if (X86::isMOVLMask(SVOp)) {
4699    if (V1IsUndef)
4700      return V2;
4701    if (ISD::isBuildVectorAllZeros(V1.getNode()))
4702      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
4703    if (!isMMX)
4704      return Op;
4705  }
4706
4707  // FIXME: fold these into legal mask.
4708  if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
4709                 X86::isMOVSLDUPMask(SVOp) ||
4710                 X86::isMOVHLPSMask(SVOp) ||
4711                 X86::isMOVLHPSMask(SVOp) ||
4712                 X86::isMOVLPMask(SVOp)))
4713    return Op;
4714
4715  if (ShouldXformToMOVHLPS(SVOp) ||
4716      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
4717    return CommuteVectorShuffle(SVOp, DAG);
4718
4719  if (isShift) {
4720    // No better options. Use a vshl / vsrl.
4721    EVT EltVT = VT.getVectorElementType();
4722    ShAmt *= EltVT.getSizeInBits();
4723    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4724  }
4725
4726  bool Commuted = false;
4727  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
4728  // 1,1,1,1 -> v8i16 though.
4729  V1IsSplat = isSplatVector(V1.getNode());
4730  V2IsSplat = isSplatVector(V2.getNode());
4731
4732  // Canonicalize the splat or undef, if present, to be on the RHS.
4733  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
4734    Op = CommuteVectorShuffle(SVOp, DAG);
4735    SVOp = cast<ShuffleVectorSDNode>(Op);
4736    V1 = SVOp->getOperand(0);
4737    V2 = SVOp->getOperand(1);
4738    std::swap(V1IsSplat, V2IsSplat);
4739    std::swap(V1IsUndef, V2IsUndef);
4740    Commuted = true;
4741  }
4742
4743  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
4744    // Shuffling low element of v1 into undef, just return v1.
4745    if (V2IsUndef)
4746      return V1;
4747    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
4748    // the instruction selector will not match, so get a canonical MOVL with
4749    // swapped operands to undo the commute.
4750    return getMOVL(DAG, dl, VT, V2, V1);
4751  }
4752
4753  if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
4754      X86::isUNPCKH_v_undef_Mask(SVOp) ||
4755      X86::isUNPCKLMask(SVOp) ||
4756      X86::isUNPCKHMask(SVOp))
4757    return Op;
4758
4759  if (V2IsSplat) {
4760    // Normalize mask so all entries that point to V2 point to its first
4761    // element, then try to match unpck{h|l} again. If it matches, return a
4762    // new vector_shuffle with the corrected mask.
4763    SDValue NewMask = NormalizeMask(SVOp, DAG);
4764    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
4765    if (NSVOp != SVOp) {
4766      if (X86::isUNPCKLMask(NSVOp, true)) {
4767        return NewMask;
4768      } else if (X86::isUNPCKHMask(NSVOp, true)) {
4769        return NewMask;
4770      }
4771    }
4772  }
4773
4774  if (Commuted) {
4775    // Commute it back and try unpck* again.
4776    // FIXME: this seems wrong.
4777    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
4778    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
4779    if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
4780        X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
4781        X86::isUNPCKLMask(NewSVOp) ||
4782        X86::isUNPCKHMask(NewSVOp))
4783      return NewOp;
4784  }
4785
4786  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
4787
4788  // Normalize the node to match x86 shuffle ops if needed
4789  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
4790    return CommuteVectorShuffle(SVOp, DAG);
4791
4792  // Check for legal shuffle and return?
4793  SmallVector<int, 16> PermMask;
4794  SVOp->getMask(PermMask);
4795  if (isShuffleMaskLegal(PermMask, VT))
4796    return Op;
4797
4798  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
4799  if (VT == MVT::v8i16) {
4800    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
4801    if (NewOp.getNode())
4802      return NewOp;
4803  }
4804
4805  if (VT == MVT::v16i8) {
4806    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
4807    if (NewOp.getNode())
4808      return NewOp;
4809  }
4810
4811  // Handle all 4 wide cases with a number of shuffles except for MMX.
4812  if (NumElems == 4 && !isMMX)
4813    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
4814
4815  return SDValue();
4816}
4817
4818SDValue
4819X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
4820                                                SelectionDAG &DAG) {
4821  EVT VT = Op.getValueType();
4822  DebugLoc dl = Op.getDebugLoc();
4823  if (VT.getSizeInBits() == 8) {
4824    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
4825                                    Op.getOperand(0), Op.getOperand(1));
4826    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4827                                    DAG.getValueType(VT));
4828    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4829  } else if (VT.getSizeInBits() == 16) {
4830    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4831    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
4832    if (Idx == 0)
4833      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4834                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4835                                     DAG.getNode(ISD::BIT_CONVERT, dl,
4836                                                 MVT::v4i32,
4837                                                 Op.getOperand(0)),
4838                                     Op.getOperand(1)));
4839    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
4840                                    Op.getOperand(0), Op.getOperand(1));
4841    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4842                                    DAG.getValueType(VT));
4843    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4844  } else if (VT == MVT::f32) {
4845    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
4846    // the result back to an FR32 register. It's only worth matching if the
4847    // result has a single use which is a store or a bitcast to i32.  And in
4848    // the case of a store, it's not worth it if the index is a constant 0,
4849    // because a MOVSSmr can be used instead, which is smaller and faster.
4850    if (!Op.hasOneUse())
4851      return SDValue();
4852    SDNode *User = *Op.getNode()->use_begin();
4853    if ((User->getOpcode() != ISD::STORE ||
4854         (isa<ConstantSDNode>(Op.getOperand(1)) &&
4855          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
4856        (User->getOpcode() != ISD::BIT_CONVERT ||
4857         User->getValueType(0) != MVT::i32))
4858      return SDValue();
4859    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4860                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
4861                                              Op.getOperand(0)),
4862                                              Op.getOperand(1));
4863    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
4864  } else if (VT == MVT::i32) {
4865    // ExtractPS works with constant index.
4866    if (isa<ConstantSDNode>(Op.getOperand(1)))
4867      return Op;
4868  }
4869  return SDValue();
4870}
4871
4872
4873SDValue
4874X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4875  if (!isa<ConstantSDNode>(Op.getOperand(1)))
4876    return SDValue();
4877
4878  if (Subtarget->hasSSE41()) {
4879    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
4880    if (Res.getNode())
4881      return Res;
4882  }
4883
4884  EVT VT = Op.getValueType();
4885  DebugLoc dl = Op.getDebugLoc();
4886  // TODO: handle v16i8.
4887  if (VT.getSizeInBits() == 16) {
4888    SDValue Vec = Op.getOperand(0);
4889    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4890    if (Idx == 0)
4891      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4892                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4893                                     DAG.getNode(ISD::BIT_CONVERT, dl,
4894                                                 MVT::v4i32, Vec),
4895                                     Op.getOperand(1)));
4896    // Transform it so it matches pextrw, which produces a 32-bit result.
4897    EVT EltVT = MVT::i32;
4898    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
4899                                    Op.getOperand(0), Op.getOperand(1));
4900    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
4901                                    DAG.getValueType(VT));
4902    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4903  } else if (VT.getSizeInBits() == 32) {
4904    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4905    if (Idx == 0)
4906      return Op;
4907
4908    // SHUFPS the element to the lowest double word, then movss.
4909    int Mask[4] = { Idx, -1, -1, -1 };
4910    EVT VVT = Op.getOperand(0).getValueType();
4911    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4912                                       DAG.getUNDEF(VVT), Mask);
4913    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4914                       DAG.getIntPtrConstant(0));
4915  } else if (VT.getSizeInBits() == 64) {
4916    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
4917    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
4918    //        to match extract_elt for f64.
4919    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4920    if (Idx == 0)
4921      return Op;
4922
4923    // UNPCKHPD the element to the lowest double word, then movsd.
4924    // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
4925    // to an f64mem, the whole operation is folded into a single MOVHPDmr.
4926    int Mask[2] = { 1, -1 };
4927    EVT VVT = Op.getOperand(0).getValueType();
4928    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4929                                       DAG.getUNDEF(VVT), Mask);
4930    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4931                       DAG.getIntPtrConstant(0));
4932  }
4933
4934  return SDValue();
4935}
4936
4937SDValue
4938X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
4939  EVT VT = Op.getValueType();
4940  EVT EltVT = VT.getVectorElementType();
4941  DebugLoc dl = Op.getDebugLoc();
4942
4943  SDValue N0 = Op.getOperand(0);
4944  SDValue N1 = Op.getOperand(1);
4945  SDValue N2 = Op.getOperand(2);
4946
4947  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
4948      isa<ConstantSDNode>(N2)) {
4949    unsigned Opc;
4950    if (VT == MVT::v8i16)
4951      Opc = X86ISD::PINSRW;
4952    else if (VT == MVT::v4i16)
4953      Opc = X86ISD::MMX_PINSRW;
4954    else if (VT == MVT::v16i8)
4955      Opc = X86ISD::PINSRB;
4956    else
4957      Opc = X86ISD::PINSRB;
4958
4959    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
4960    // argument.
4961    if (N1.getValueType() != MVT::i32)
4962      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4963    if (N2.getValueType() != MVT::i32)
4964      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4965    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
4966  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
4967    // Bits [7:6] of the constant are the source select.  This will always be
4968    //  zero here.  The DAG Combiner may combine an extract_elt index into these
4969    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
4970    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
4971    // Bits [5:4] of the constant are the destination select.  This is the
4972    //  value of the incoming immediate.
4973    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
4974    //   combine either bitwise AND or insert of float 0.0 to set these bits.
4975    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
4976    // Create this as a scalar to vector.
4977    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
4978    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
4979  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
4980    // PINSR* works with constant index.
4981    return Op;
4982  }
4983  return SDValue();
4984}
4985
4986SDValue
4987X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4988  EVT VT = Op.getValueType();
4989  EVT EltVT = VT.getVectorElementType();
4990
4991  if (Subtarget->hasSSE41())
4992    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
4993
4994  if (EltVT == MVT::i8)
4995    return SDValue();
4996
4997  DebugLoc dl = Op.getDebugLoc();
4998  SDValue N0 = Op.getOperand(0);
4999  SDValue N1 = Op.getOperand(1);
5000  SDValue N2 = Op.getOperand(2);
5001
5002  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
5003    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
5004    // as its second argument.
5005    if (N1.getValueType() != MVT::i32)
5006      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5007    if (N2.getValueType() != MVT::i32)
5008      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5009    return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
5010                       dl, VT, N0, N1, N2);
5011  }
5012  return SDValue();
5013}
5014
5015SDValue
5016X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
5017  DebugLoc dl = Op.getDebugLoc();
5018  if (Op.getValueType() == MVT::v2f32)
5019    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
5020                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
5021                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
5022                                               Op.getOperand(0))));
5023
5024  if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
5025    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
5026
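  // The remaining cases go through i32: any-extend the scalar, place it in
  // lane 0 of an i32 vector, and bitcast that to the requested type (v4i32
  // for the 128-bit types, v2i32 for the 64-bit MMX types).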
5027  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
5028  EVT VT = MVT::v2i32;
5029  switch (Op.getValueType().getSimpleVT().SimpleTy) {
5030  default: break;
5031  case MVT::v16i8:
5032  case MVT::v8i16:
5033    VT = MVT::v4i32;
5034    break;
5035  }
5036  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
5037                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
5038}
5039
5040// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
5041// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
5042// one of the above mentioned nodes. It has to be wrapped because otherwise
5043// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
5044// be used to form an addressing mode. These wrapped nodes will be selected
5045// into MOV32ri.
5046SDValue
5047X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
5048  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
5049
5050  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5051  // global base reg.
5052  unsigned char OpFlag = 0;
5053  unsigned WrapperKind = X86ISD::Wrapper;
5054  CodeModel::Model M = getTargetMachine().getCodeModel();
5055
5056  if (Subtarget->isPICStyleRIPRel() &&
5057      (M == CodeModel::Small || M == CodeModel::Kernel))
5058    WrapperKind = X86ISD::WrapperRIP;
5059  else if (Subtarget->isPICStyleGOT())
5060    OpFlag = X86II::MO_GOTOFF;
5061  else if (Subtarget->isPICStyleStubPIC())
5062    OpFlag = X86II::MO_PIC_BASE_OFFSET;
5063
5064  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
5065                                             CP->getAlignment(),
5066                                             CP->getOffset(), OpFlag);
5067  DebugLoc DL = CP->getDebugLoc();
5068  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5069  // With PIC, the address is actually $g + Offset.
5070  if (OpFlag) {
5071    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5072                         DAG.getNode(X86ISD::GlobalBaseReg,
5073                                     DebugLoc(), getPointerTy()),
5074                         Result);
5075  }
5076
5077  return Result;
5078}
5079
5080SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
5081  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
5082
5083  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5084  // global base reg.
5085  unsigned char OpFlag = 0;
5086  unsigned WrapperKind = X86ISD::Wrapper;
5087  CodeModel::Model M = getTargetMachine().getCodeModel();
5088
5089  if (Subtarget->isPICStyleRIPRel() &&
5090      (M == CodeModel::Small || M == CodeModel::Kernel))
5091    WrapperKind = X86ISD::WrapperRIP;
5092  else if (Subtarget->isPICStyleGOT())
5093    OpFlag = X86II::MO_GOTOFF;
5094  else if (Subtarget->isPICStyleStubPIC())
5095    OpFlag = X86II::MO_PIC_BASE_OFFSET;
5096
5097  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
5098                                          OpFlag);
5099  DebugLoc DL = JT->getDebugLoc();
5100  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5101
5102  // With PIC, the address is actually $g + Offset.
5103  if (OpFlag) {
5104    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5105                         DAG.getNode(X86ISD::GlobalBaseReg,
5106                                     DebugLoc(), getPointerTy()),
5107                         Result);
5108  }
5109
5110  return Result;
5111}
5112
5113SDValue
5114X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
5115  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
5116
5117  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5118  // global base reg.
5119  unsigned char OpFlag = 0;
5120  unsigned WrapperKind = X86ISD::Wrapper;
5121  CodeModel::Model M = getTargetMachine().getCodeModel();
5122
5123  if (Subtarget->isPICStyleRIPRel() &&
5124      (M == CodeModel::Small || M == CodeModel::Kernel))
5125    WrapperKind = X86ISD::WrapperRIP;
5126  else if (Subtarget->isPICStyleGOT())
5127    OpFlag = X86II::MO_GOTOFF;
5128  else if (Subtarget->isPICStyleStubPIC())
5129    OpFlag = X86II::MO_PIC_BASE_OFFSET;
5130
5131  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
5132
5133  DebugLoc DL = Op.getDebugLoc();
5134  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5135
5136
5137  // With PIC, the address is actually $g + Offset.
5138  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
5139      !Subtarget->is64Bit()) {
5140    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5141                         DAG.getNode(X86ISD::GlobalBaseReg,
5142                                     DebugLoc(), getPointerTy()),
5143                         Result);
5144  }
5145
5146  return Result;
5147}
5148
5149SDValue
5150X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) {
5151  // Create the TargetBlockAddress node.
5152  unsigned char OpFlags =
5153    Subtarget->ClassifyBlockAddressReference();
5154  CodeModel::Model M = getTargetMachine().getCodeModel();
5155  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
5156  DebugLoc dl = Op.getDebugLoc();
5157  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
5158                                       /*isTarget=*/true, OpFlags);
5159
5160  if (Subtarget->isPICStyleRIPRel() &&
5161      (M == CodeModel::Small || M == CodeModel::Kernel))
5162    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5163  else
5164    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5165
5166  // With PIC, the address is actually $g + Offset.
5167  if (isGlobalRelativeToPICBase(OpFlags)) {
5168    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5169                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5170                         Result);
5171  }
5172
5173  return Result;
5174}
5175
5176SDValue
5177X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
5178                                      int64_t Offset,
5179                                      SelectionDAG &DAG) const {
5180  // Create the TargetGlobalAddress node, folding in the constant
5181  // offset if it is legal.
5182  unsigned char OpFlags =
5183    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
5184  CodeModel::Model M = getTargetMachine().getCodeModel();
5185  SDValue Result;
5186  if (OpFlags == X86II::MO_NO_FLAG &&
5187      X86::isOffsetSuitableForCodeModel(Offset, M)) {
5188    // A direct static reference to a global.
5189    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
5190    Offset = 0;
5191  } else {
5192    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
5193  }
5194
5195  if (Subtarget->isPICStyleRIPRel() &&
5196      (M == CodeModel::Small || M == CodeModel::Kernel))
5197    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5198  else
5199    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5200
5201  // With PIC, the address is actually $g + Offset.
5202  if (isGlobalRelativeToPICBase(OpFlags)) {
5203    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5204                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5205                         Result);
5206  }
5207
5208  // For globals that require a load from a stub to get the address, emit the
5209  // load.
5210  if (isGlobalStubReference(OpFlags))
5211    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
5212                         PseudoSourceValue::getGOT(), 0, false, false, 0);
5213
5214  // If there was a non-zero offset that we didn't fold, create an explicit
5215  // addition for it.
5216  if (Offset != 0)
5217    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
5218                         DAG.getConstant(Offset, getPointerTy()));
5219
5220  return Result;
5221}
5222
5223SDValue
5224X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
5225  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
5226  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
5227  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
5228}
5229
5230static SDValue
5231GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
5232           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
5233           unsigned char OperandFlags) {
5234  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5235  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
5236  DebugLoc dl = GA->getDebugLoc();
5237  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
5238                                           GA->getValueType(0),
5239                                           GA->getOffset(),
5240                                           OperandFlags);
5241  if (InFlag) {
5242    SDValue Ops[] = { Chain,  TGA, *InFlag };
5243    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
5244  } else {
5245    SDValue Ops[]  = { Chain, TGA };
5246    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
5247  }
5248
5249  // TLSADDR will be codegen'ed as a call. Inform MFI that this function has calls.
5250  MFI->setHasCalls(true);
5251
5252  SDValue Flag = Chain.getValue(1);
5253  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
5254}
5255
5256// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
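//
// (As a rough illustration only: the TLSADDR node built by GetTLSADDR expands
//  to the standard IA-32 general-dynamic sequence, approximately
//      leal  x@tlsgd(,%ebx,1), %eax
//      call  ___tls_get_addr@plt
//  with the variable's address returned in %eax.)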
5257static SDValue
5258LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5259                                const EVT PtrVT) {
5260  SDValue InFlag;
5261  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
5262  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
5263                                     DAG.getNode(X86ISD::GlobalBaseReg,
5264                                                 DebugLoc(), PtrVT), InFlag);
5265  InFlag = Chain.getValue(1);
5266
5267  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
5268}
5269
5270// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
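//
// (Again as a rough illustration only: on x86-64 the general-dynamic sequence
//  is approximately
//      leaq  x@tlsgd(%rip), %rdi
//      call  __tls_get_addr@plt
//  with the address returned in %rax.)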
5271static SDValue
5272LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5273                                const EVT PtrVT) {
5274  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
5275                    X86::RAX, X86II::MO_TLSGD);
5276}
5277
5278// Lower ISD::GlobalTLSAddress using the "initial exec" (for non-PIC) or
5279// "local exec" model.
5280static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5281                                   const EVT PtrVT, TLSModel::Model model,
5282                                   bool is64Bit) {
5283  DebugLoc dl = GA->getDebugLoc();
5284  // Get the Thread Pointer
5285  SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
5286                             DebugLoc(), PtrVT,
5287                             DAG.getRegister(is64Bit? X86::FS : X86::GS,
5288                                             MVT::i32));
5289
5290  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
5291                                      NULL, 0, false, false, 0);
5292
5293  unsigned char OperandFlags = 0;
5294  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
5295  // initial exec.
5296  unsigned WrapperKind = X86ISD::Wrapper;
5297  if (model == TLSModel::LocalExec) {
5298    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
5299  } else if (is64Bit) {
5300    assert(model == TLSModel::InitialExec);
5301    OperandFlags = X86II::MO_GOTTPOFF;
5302    WrapperKind = X86ISD::WrapperRIP;
5303  } else {
5304    assert(model == TLSModel::InitialExec);
5305    OperandFlags = X86II::MO_INDNTPOFF;
5306  }
5307
5308  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
5309  // exec)
5310  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
5311                                           GA->getOffset(), OperandFlags);
5312  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
5313
5314  if (model == TLSModel::InitialExec)
5315    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
5316                         PseudoSourceValue::getGOT(), 0, false, false, 0);
5317
5318  // The address of the thread-local variable is the sum of the thread
5319  // pointer and the variable's offset.
5320  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
5321}
5322
5323SDValue
5324X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
5325  // TODO: implement the "local dynamic" model
5326  // TODO: implement the "initial exec"model for pic executables
5327  assert(Subtarget->isTargetELF() &&
5328         "TLS not implemented for non-ELF targets");
5329  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
5330  const GlobalValue *GV = GA->getGlobal();
5331
5332  // If GV is an alias then use the aliasee for determining
5333  // thread-localness.
5334  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
5335    GV = GA->resolveAliasedGlobal(false);
5336
5337  TLSModel::Model model = getTLSModel(GV,
5338                                      getTargetMachine().getRelocationModel());
5339
5340  switch (model) {
5341  case TLSModel::GeneralDynamic:
5342  case TLSModel::LocalDynamic: // not implemented
5343    if (Subtarget->is64Bit())
5344      return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
5345    return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
5346
5347  case TLSModel::InitialExec:
5348  case TLSModel::LocalExec:
5349    return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
5350                               Subtarget->is64Bit());
5351  }
5352
5353  llvm_unreachable("Unreachable");
5354  return SDValue();
5355}
5356
5357
5358/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
5359/// take a 2 x i32 value to shift plus a shift amount.
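///
/// A rough sketch (illustrative C only) of what this lowering computes for
/// SRL_PARTS on two i32 halves; SHL_PARTS and SRA_PARTS are analogous. The
/// CMOVs below select between the two cases because x86 shifts mask the
/// amount to 5 bits. (This glosses over the amt % 32 == 0 corner case, which
/// the real SHRD/SHLD instructions handle.)
///
///   void srl_parts(uint32_t lo, uint32_t hi, unsigned amt,
///                  uint32_t *reslo, uint32_t *reshi) {
///     uint32_t t2 = (lo >> (amt & 31)) | (hi << (32 - (amt & 31))); // SHRD
///     uint32_t t3 = hi >> (amt & 31);                               // SRL
///     if (amt & 32) { *reslo = t3; *reshi = 0; }  // shift amount >= 32
///     else          { *reslo = t2; *reshi = t3; }
///   }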
5360SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
5361  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
5362  EVT VT = Op.getValueType();
5363  unsigned VTBits = VT.getSizeInBits();
5364  DebugLoc dl = Op.getDebugLoc();
5365  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
5366  SDValue ShOpLo = Op.getOperand(0);
5367  SDValue ShOpHi = Op.getOperand(1);
5368  SDValue ShAmt  = Op.getOperand(2);
5369  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
5370                                     DAG.getConstant(VTBits - 1, MVT::i8))
5371                       : DAG.getConstant(0, VT);
5372
5373  SDValue Tmp2, Tmp3;
5374  if (Op.getOpcode() == ISD::SHL_PARTS) {
5375    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
5376    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
5377  } else {
5378    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
5379    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
5380  }
5381
5382  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
5383                                DAG.getConstant(VTBits, MVT::i8));
5384  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
5385                             AndNode, DAG.getConstant(0, MVT::i8));
5386
5387  SDValue Hi, Lo;
5388  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5389  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
5390  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
5391
5392  if (Op.getOpcode() == ISD::SHL_PARTS) {
5393    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
5394    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
5395  } else {
5396    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
5397    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
5398  }
5399
5400  SDValue Ops[2] = { Lo, Hi };
5401  return DAG.getMergeValues(Ops, 2, dl);
5402}
5403
5404SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5405  EVT SrcVT = Op.getOperand(0).getValueType();
5406
5407  if (SrcVT.isVector()) {
5408    if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
5409      return Op;
5410    }
5411    return SDValue();
5412  }
5413
5414  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
5415         "Unknown SINT_TO_FP to lower!");
5416
5417  // These are really Legal; return the operand so the caller accepts it as
5418  // Legal.
5419  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
5420    return Op;
5421  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
5422      Subtarget->is64Bit()) {
5423    return Op;
5424  }
5425
5426  DebugLoc dl = Op.getDebugLoc();
5427  unsigned Size = SrcVT.getSizeInBits()/8;
5428  MachineFunction &MF = DAG.getMachineFunction();
5429  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
5430  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5431  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5432                               StackSlot,
5433                               PseudoSourceValue::getFixedStack(SSFI), 0,
5434                               false, false, 0);
5435  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
5436}
5437
5438SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
5439                                     SDValue StackSlot,
5440                                     SelectionDAG &DAG) {
5441  // Build the FILD
5442  DebugLoc dl = Op.getDebugLoc();
5443  SDVTList Tys;
5444  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
5445  if (useSSE)
5446    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
5447  else
5448    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
5449  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
5450  SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
5451                               Tys, Ops, array_lengthof(Ops));
5452
5453  if (useSSE) {
5454    Chain = Result.getValue(1);
5455    SDValue InFlag = Result.getValue(2);
5456
5457    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
5458    // shouldn't be necessary except that RFP cannot be live across
5459    // multiple blocks. When the stackifier is fixed, they can be uncoupled.
5460    MachineFunction &MF = DAG.getMachineFunction();
5461    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
5462    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5463    Tys = DAG.getVTList(MVT::Other);
5464    SDValue Ops[] = {
5465      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
5466    };
5467    Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops));
5468    Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
5469                         PseudoSourceValue::getFixedStack(SSFI), 0,
5470                         false, false, 0);
5471  }
5472
5473  return Result;
5474}
5475
5476// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
5477SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
5478  // This algorithm is not obvious. Here it is in C code, more or less:
5479  /*
5480    double uint64_to_double( uint32_t hi, uint32_t lo ) {
5481      static const __m128i exp = { 0x4330000045300000ULL, 0 };
5482      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
5483
5484      // Copy ints to xmm registers.
5485      __m128i xh = _mm_cvtsi32_si128( hi );
5486      __m128i xl = _mm_cvtsi32_si128( lo );
5487
5488      // Combine into low half of a single xmm register.
5489      __m128i x = _mm_unpacklo_epi32( xh, xl );
5490      __m128d d;
5491      double sd;
5492
5493      // Merge in appropriate exponents to give the integer bits the right
5494      // magnitude.
5495      x = _mm_unpacklo_epi32( x, exp );
5496
5497      // Subtract away the biases to deal with the IEEE-754 double precision
5498      // implicit 1.
5499      d = _mm_sub_pd( (__m128d) x, bias );
5500
5501      // All conversions up to here are exact. The correctly rounded result is
5502      // calculated using the current rounding mode using the following
5503      // horizontal add.
5504      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
5505      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
5506                                // store doesn't really need to be here (except
5507                                // maybe to zero the other double)
5508      return sd;
5509    }
5510  */
5511
5512  DebugLoc dl = Op.getDebugLoc();
5513  LLVMContext *Context = DAG.getContext();
5514
5515  // Build some magic constants.
5516  std::vector<Constant*> CV0;
5517  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
5518  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
5519  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
5520  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
5521  Constant *C0 = ConstantVector::get(CV0);
5522  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
5523
5524  std::vector<Constant*> CV1;
5525  CV1.push_back(
5526    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
5527  CV1.push_back(
5528    ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
5529  Constant *C1 = ConstantVector::get(CV1);
5530  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
5531
5532  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5533                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5534                                        Op.getOperand(0),
5535                                        DAG.getIntPtrConstant(1)));
5536  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5537                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5538                                        Op.getOperand(0),
5539                                        DAG.getIntPtrConstant(0)));
5540  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
5541  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
5542                              PseudoSourceValue::getConstantPool(), 0,
5543                              false, false, 16);
5544  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
5545  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
5546  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
5547                              PseudoSourceValue::getConstantPool(), 0,
5548                              false, false, 16);
5549  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
5550
5551  // Add the halves; the easiest way is to swap them into another reg first.
5552  int ShufMask[2] = { 1, -1 };
5553  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
5554                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
5555  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
5556  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
5557                     DAG.getIntPtrConstant(0));
5558}
5559
5560// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
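//
// The idea, sketched in C for illustration (not the code emitted verbatim):
/*
    double uint32_to_double(uint32_t x) {
      // Place x in the low 32 mantissa bits of a double whose upper bits
      // encode 2^52 (0x43300000_00000000); that bit pattern is exactly the
      // value 2^52 + x, so subtracting 2^52 leaves x converted exactly.
      uint64_t bits = 0x4330000000000000ULL | (uint64_t)x;
      double biased;
      memcpy(&biased, &bits, sizeof biased);   // needs <string.h>, <stdint.h>
      return biased - 0x1.0p52;
    }
*/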
5561SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
5562  DebugLoc dl = Op.getDebugLoc();
5563  // FP constant to bias correct the final result.
5564  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
5565                                   MVT::f64);
5566
5567  // Load the 32-bit value into an XMM register.
5568  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5569                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5570                                         Op.getOperand(0),
5571                                         DAG.getIntPtrConstant(0)));
5572
5573  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5574                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
5575                     DAG.getIntPtrConstant(0));
5576
5577  // Or the load with the bias.
5578  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
5579                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5580                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5581                                                   MVT::v2f64, Load)),
5582                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5583                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5584                                                   MVT::v2f64, Bias)));
5585  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5586                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
5587                   DAG.getIntPtrConstant(0));
5588
5589  // Subtract the bias.
5590  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
5591
5592  // Handle final rounding.
5593  EVT DestVT = Op.getValueType();
5594
5595  if (DestVT.bitsLT(MVT::f64)) {
5596    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
5597                       DAG.getIntPtrConstant(0));
5598  } else if (DestVT.bitsGT(MVT::f64)) {
5599    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
5600  }
5601
5602  // DestVT is f64, so no further rounding or extension is needed.
5603  return Sub;
5604}
5605
5606SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5607  SDValue N0 = Op.getOperand(0);
5608  DebugLoc dl = Op.getDebugLoc();
5609
5610  // Since UINT_TO_FP is not legal (it's marked custom), the DAG combiner
5611  // won't optimize it to a SINT_TO_FP when the sign bit is known zero, so
5612  // perform the optimization here.
5613  if (DAG.SignBitIsZero(N0))
5614    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
5615
5616  EVT SrcVT = N0.getValueType();
5617  if (SrcVT == MVT::i64) {
5618    // We only handle the SSE2 f64 target here; the caller can expand the rest.
5619    if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
5620      return SDValue();
5621
5622    return LowerUINT_TO_FP_i64(Op, DAG);
5623  } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
5624    return LowerUINT_TO_FP_i32(Op, DAG);
5625  }
5626
5627  assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");
5628
5629  // Make a 64-bit buffer, and use it to build an FILD.
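  // (Sketch of the idea: the i32 goes in the low dword of the slot and zero in
  //  the high dword, so the slot holds the zero-extended value as an i64. That
  //  i64 is always non-negative, so a signed FILD of it produces the correct
  //  unsigned result.)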
5630  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
5631  SDValue WordOff = DAG.getConstant(4, getPointerTy());
5632  SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
5633                                   getPointerTy(), StackSlot, WordOff);
5634  SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5635                                StackSlot, NULL, 0, false, false, 0);
5636  SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
5637                                OffsetSlot, NULL, 0, false, false, 0);
5638  return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
5639}
5640
5641std::pair<SDValue,SDValue> X86TargetLowering::
5642FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
5643  DebugLoc dl = Op.getDebugLoc();
5644
5645  EVT DstTy = Op.getValueType();
5646
5647  if (!IsSigned) {
5648    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
5649    DstTy = MVT::i64;
5650  }
5651
5652  assert(DstTy.getSimpleVT() <= MVT::i64 &&
5653         DstTy.getSimpleVT() >= MVT::i16 &&
5654         "Unknown FP_TO_SINT to lower!");
5655
5656  // These are really Legal.
5657  if (DstTy == MVT::i32 &&
5658      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5659    return std::make_pair(SDValue(), SDValue());
5660  if (Subtarget->is64Bit() &&
5661      DstTy == MVT::i64 &&
5662      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5663    return std::make_pair(SDValue(), SDValue());
5664
5665  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
5666  // stack slot.
5667  MachineFunction &MF = DAG.getMachineFunction();
5668  unsigned MemSize = DstTy.getSizeInBits()/8;
5669  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
5670  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5671
5672  unsigned Opc;
5673  switch (DstTy.getSimpleVT().SimpleTy) {
5674  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
5675  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
5676  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
5677  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
5678  }
5679
5680  SDValue Chain = DAG.getEntryNode();
5681  SDValue Value = Op.getOperand(0);
5682  if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
5683    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
5684    Chain = DAG.getStore(Chain, dl, Value, StackSlot,
5685                         PseudoSourceValue::getFixedStack(SSFI), 0,
5686                         false, false, 0);
5687    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
5688    SDValue Ops[] = {
5689      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
5690    };
5691    Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
5692    Chain = Value.getValue(1);
5693    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
5694    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5695  }
5696
5697  // Build the FP_TO_INT*_IN_MEM
5698  SDValue Ops[] = { Chain, Value, StackSlot };
5699  SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
5700
5701  return std::make_pair(FIST, StackSlot);
5702}
5703
5704SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
5705  if (Op.getValueType().isVector()) {
5706    if (Op.getValueType() == MVT::v2i32 &&
5707        Op.getOperand(0).getValueType() == MVT::v2f64) {
5708      return Op;
5709    }
5710    return SDValue();
5711  }
5712
5713  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
5714  SDValue FIST = Vals.first, StackSlot = Vals.second;
5715  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
5716  if (FIST.getNode() == 0) return Op;
5717
5718  // Load the result.
5719  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5720                     FIST, StackSlot, NULL, 0, false, false, 0);
5721}
5722
5723SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
5724  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
5725  SDValue FIST = Vals.first, StackSlot = Vals.second;
5726  assert(FIST.getNode() && "Unexpected failure");
5727
5728  // Load the result.
5729  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5730                     FIST, StackSlot, NULL, 0, false, false, 0);
5731}
5732
5733SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
5734  LLVMContext *Context = DAG.getContext();
5735  DebugLoc dl = Op.getDebugLoc();
5736  EVT VT = Op.getValueType();
5737  EVT EltVT = VT;
5738  if (VT.isVector())
5739    EltVT = VT.getVectorElementType();
5740  std::vector<Constant*> CV;
5741  if (EltVT == MVT::f64) {
5742    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
5743    CV.push_back(C);
5744    CV.push_back(C);
5745  } else {
5746    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
5747    CV.push_back(C);
5748    CV.push_back(C);
5749    CV.push_back(C);
5750    CV.push_back(C);
5751  }
5752  Constant *C = ConstantVector::get(CV);
5753  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5754  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5755                             PseudoSourceValue::getConstantPool(), 0,
5756                             false, false, 16);
5757  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
5758}
5759
5760SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
5761  LLVMContext *Context = DAG.getContext();
5762  DebugLoc dl = Op.getDebugLoc();
5763  EVT VT = Op.getValueType();
5764  EVT EltVT = VT;
5765  if (VT.isVector())
5766    EltVT = VT.getVectorElementType();
5767  std::vector<Constant*> CV;
5768  if (EltVT == MVT::f64) {
5769    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
5770    CV.push_back(C);
5771    CV.push_back(C);
5772  } else {
5773    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
5774    CV.push_back(C);
5775    CV.push_back(C);
5776    CV.push_back(C);
5777    CV.push_back(C);
5778  }
5779  Constant *C = ConstantVector::get(CV);
5780  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5781  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5782                             PseudoSourceValue::getConstantPool(), 0,
5783                             false, false, 16);
5784  if (VT.isVector()) {
5785    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
5786                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
5787                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5788                                Op.getOperand(0)),
5789                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
5790  } else {
5791    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
5792  }
5793}
5794
5795SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
5796  LLVMContext *Context = DAG.getContext();
5797  SDValue Op0 = Op.getOperand(0);
5798  SDValue Op1 = Op.getOperand(1);
5799  DebugLoc dl = Op.getDebugLoc();
5800  EVT VT = Op.getValueType();
5801  EVT SrcVT = Op1.getValueType();
5802
5803  // If second operand is smaller, extend it first.
5804  if (SrcVT.bitsLT(VT)) {
5805    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
5806    SrcVT = VT;
5807  }
5808  // And if it is bigger, shrink it first.
5809  if (SrcVT.bitsGT(VT)) {
5810    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
5811    SrcVT = VT;
5812  }
5813
5814  // At this point the operands and the result should have the same
5815  // type, and that won't be f80 since that is not custom lowered.
5816
5817  // First get the sign bit of second operand.
5818  std::vector<Constant*> CV;
5819  if (SrcVT == MVT::f64) {
5820    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
5821    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5822  } else {
5823    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
5824    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5825    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5826    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5827  }
5828  Constant *C = ConstantVector::get(CV);
5829  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5830  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
5831                              PseudoSourceValue::getConstantPool(), 0,
5832                              false, false, 16);
5833  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
5834
5835  // Shift sign bit right or left if the two operands have different types.
5836  if (SrcVT.bitsGT(VT)) {
5837    // Op0 is MVT::f32, Op1 is MVT::f64.
5838    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
5839    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
5840                          DAG.getConstant(32, MVT::i32));
5841    SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
5842    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
5843                          DAG.getIntPtrConstant(0));
5844  }
5845
5846  // Clear first operand sign bit.
5847  CV.clear();
5848  if (VT == MVT::f64) {
5849    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
5850    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5851  } else {
5852    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
5853    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5854    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5855    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5856  }
5857  C = ConstantVector::get(CV);
5858  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5859  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5860                              PseudoSourceValue::getConstantPool(), 0,
5861                              false, false, 16);
5862  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
5863
5864  // Or the value with the sign bit.
5865  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
5866}
5867
5868/// Emit nodes that will be selected as "test Op0,Op0", or something
5869/// equivalent.
5870SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
5871                                    SelectionDAG &DAG) {
5872  DebugLoc dl = Op.getDebugLoc();
5873
5874  // CF and OF aren't always set the way we want. Determine which
5875  // of these we need.
5876  bool NeedCF = false;
5877  bool NeedOF = false;
5878  switch (X86CC) {
5879  case X86::COND_A: case X86::COND_AE:
5880  case X86::COND_B: case X86::COND_BE:
5881    NeedCF = true;
5882    break;
5883  case X86::COND_G: case X86::COND_GE:
5884  case X86::COND_L: case X86::COND_LE:
5885  case X86::COND_O: case X86::COND_NO:
5886    NeedOF = true;
5887    break;
5888  default: break;
5889  }
5890
5891  // See if we can use the EFLAGS value from the operand instead of
5892  // doing a separate TEST. TEST always sets OF and CF to 0, so if the
5893  // condition needs OF or CF we cannot simply reuse the operand's flags.
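  // (For example, for code like "if ((x | y) != 0)" the OR already sets ZF,
  //  so the comparison can use the flag result of X86ISD::OR directly instead
  //  of emitting a separate "test" of the OR's value. Illustrative example.)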
5894  if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
5895    unsigned Opcode = 0;
5896    unsigned NumOperands = 0;
5897    switch (Op.getNode()->getOpcode()) {
5898    case ISD::ADD:
5899      // Due to an isel shortcoming, be conservative if this add is likely to
5900      // be selected as part of a load-modify-store instruction. When the root
5901      // node in a match is a store, isel doesn't know how to remap non-chain
5902      // non-flag uses of other nodes in the match, such as the ADD in this
5903      // case. This leads to the ADD being left around and reselected, with
5904      // the result being two adds in the output.
5905      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5906           UE = Op.getNode()->use_end(); UI != UE; ++UI)
5907        if (UI->getOpcode() == ISD::STORE)
5908          goto default_case;
5909      if (ConstantSDNode *C =
5910            dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
5911        // An add of one will be selected as an INC.
5912        if (C->getAPIntValue() == 1) {
5913          Opcode = X86ISD::INC;
5914          NumOperands = 1;
5915          break;
5916        }
5917        // An add of negative one (subtract of one) will be selected as a DEC.
5918        if (C->getAPIntValue().isAllOnesValue()) {
5919          Opcode = X86ISD::DEC;
5920          NumOperands = 1;
5921          break;
5922        }
5923      }
5924      // Otherwise use a regular EFLAGS-setting add.
5925      Opcode = X86ISD::ADD;
5926      NumOperands = 2;
5927      break;
5928    case ISD::AND: {
5929      // If the primary result of the 'and' isn't used, don't bother using
5930      // X86ISD::AND, because a TEST instruction will be better.
5931      bool NonFlagUse = false;
5932      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5933             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
5934        SDNode *User = *UI;
5935        unsigned UOpNo = UI.getOperandNo();
5936        if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
5937          // Look past the truncate.
5938          UOpNo = User->use_begin().getOperandNo();
5939          User = *User->use_begin();
5940        }
5941        if (User->getOpcode() != ISD::BRCOND &&
5942            User->getOpcode() != ISD::SETCC &&
5943            (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
5944          NonFlagUse = true;
5945          break;
5946        }
5947      }
5948      if (!NonFlagUse)
5949        break;
5950    }
5951    // FALL THROUGH
5952    case ISD::SUB:
5953    case ISD::OR:
5954    case ISD::XOR:
5955      // Due to the ISEL shortcoming noted above, be conservative if this op is
5956      // likely to be selected as part of a load-modify-store instruction.
5957      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5958           UE = Op.getNode()->use_end(); UI != UE; ++UI)
5959        if (UI->getOpcode() == ISD::STORE)
5960          goto default_case;
5961      // Otherwise use a regular EFLAGS-setting instruction.
5962      switch (Op.getNode()->getOpcode()) {
5963      case ISD::SUB: Opcode = X86ISD::SUB; break;
5964      case ISD::OR:  Opcode = X86ISD::OR;  break;
5965      case ISD::XOR: Opcode = X86ISD::XOR; break;
5966      case ISD::AND: Opcode = X86ISD::AND; break;
5967      default: llvm_unreachable("unexpected operator!");
5968      }
5969      NumOperands = 2;
5970      break;
5971    case X86ISD::ADD:
5972    case X86ISD::SUB:
5973    case X86ISD::INC:
5974    case X86ISD::DEC:
5975    case X86ISD::OR:
5976    case X86ISD::XOR:
5977    case X86ISD::AND:
5978      return SDValue(Op.getNode(), 1);
5979    default:
5980    default_case:
5981      break;
5982    }
5983    if (Opcode != 0) {
5984      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
5985      SmallVector<SDValue, 4> Ops;
5986      for (unsigned i = 0; i != NumOperands; ++i)
5987        Ops.push_back(Op.getOperand(i));
5988      SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
5989      DAG.ReplaceAllUsesWith(Op, New);
5990      return SDValue(New.getNode(), 1);
5991    }
5992  }
5993
5994  // Otherwise just emit a CMP with 0, which is the TEST pattern.
5995  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
5996                     DAG.getConstant(0, Op.getValueType()));
5997}
5998
5999/// Emit nodes that will be selected as "cmp Op0,Op1", or something
6000/// equivalent.
6001SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
6002                                   SelectionDAG &DAG) {
6003  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
6004    if (C->getAPIntValue() == 0)
6005      return EmitTest(Op0, X86CC, DAG);
6006
6007  DebugLoc dl = Op0.getDebugLoc();
6008  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
6009}
6010
6011/// LowerToBT - The result of an 'and' is compared against zero. Turn it into
6012/// a BT node if possible.
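///
/// For example (an illustrative sketch of the selected code), a test such as
/// "(x & (1 << n)) != 0" can become roughly
///     bt   %ecx, %eax        // CF = bit n of x (n in %ecx, x in %eax)
///     setb %al               // materialize CF
/// instead of building the mask, and-ing, and comparing against zero.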
6013static SDValue LowerToBT(SDValue And, ISD::CondCode CC,
6014                         DebugLoc dl, SelectionDAG &DAG) {
6015  SDValue Op0 = And.getOperand(0);
6016  SDValue Op1 = And.getOperand(1);
6017  if (Op0.getOpcode() == ISD::TRUNCATE)
6018    Op0 = Op0.getOperand(0);
6019  if (Op1.getOpcode() == ISD::TRUNCATE)
6020    Op1 = Op1.getOperand(0);
6021
6022  SDValue LHS, RHS;
6023  if (Op1.getOpcode() == ISD::SHL) {
6024    if (ConstantSDNode *And10C = dyn_cast<ConstantSDNode>(Op1.getOperand(0)))
6025      if (And10C->getZExtValue() == 1) {
6026        LHS = Op0;
6027        RHS = Op1.getOperand(1);
6028      }
6029  } else if (Op0.getOpcode() == ISD::SHL) {
6030    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
6031      if (And00C->getZExtValue() == 1) {
6032        LHS = Op1;
6033        RHS = Op0.getOperand(1);
6034      }
6035  } else if (Op1.getOpcode() == ISD::Constant) {
6036    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
6037    SDValue AndLHS = Op0;
6038    if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
6039      LHS = AndLHS.getOperand(0);
6040      RHS = AndLHS.getOperand(1);
6041    }
6042  }
6043
6044  if (LHS.getNode()) {
6045    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
6046    // instruction.  Since the shift amount is in-range-or-undefined, a bittest
6047    // on the i16 value would be ok, but we extend to i32 because the encoding
6048    // for the i16 version is larger than the i32 version.
6049    if (LHS.getValueType() == MVT::i8)
6050      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
6051
6052    // If the operand types disagree, extend the shift amount to match.  Since
6053    // BT ignores high bits (like shifts) we can use anyextend.
6054    if (LHS.getValueType() != RHS.getValueType())
6055      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
6056
6057    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
6058    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
6059    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6060                       DAG.getConstant(Cond, MVT::i8), BT);
6061  }
6062
6063  return SDValue();
6064}
6065
6066SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
6067  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
6068  SDValue Op0 = Op.getOperand(0);
6069  SDValue Op1 = Op.getOperand(1);
6070  DebugLoc dl = Op.getDebugLoc();
6071  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6072
6073  // Optimize to BT if possible.
6074  // Lower (X & (1 << N)) == 0 to BT(X, N).
6075  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
6076  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
6077  if (Op0.getOpcode() == ISD::AND &&
6078      Op0.hasOneUse() &&
6079      Op1.getOpcode() == ISD::Constant &&
6080      cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
6081      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6082    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
6083    if (NewSetCC.getNode())
6084      return NewSetCC;
6085  }
6086
6087  // Look for "(setcc) == / != 1" to avoid unnecessary setcc.
6088  if (Op0.getOpcode() == X86ISD::SETCC &&
6089      Op1.getOpcode() == ISD::Constant &&
6090      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
6091       cast<ConstantSDNode>(Op1)->isNullValue()) &&
6092      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6093    X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
6094    bool Invert = (CC == ISD::SETNE) ^
6095      cast<ConstantSDNode>(Op1)->isNullValue();
6096    if (Invert)
6097      CCode = X86::GetOppositeBranchCondition(CCode);
6098    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6099                       DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
6100  }
6101
6102  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
6103  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
6104  if (X86CC == X86::COND_INVALID)
6105    return SDValue();
6106
6107  SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
6108
6109  // Use sbb x, x to materialize carry bit into a GPR.
6110  if (X86CC == X86::COND_B)
6111    return DAG.getNode(ISD::AND, dl, MVT::i8,
6112                       DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
6113                                   DAG.getConstant(X86CC, MVT::i8), Cond),
6114                       DAG.getConstant(1, MVT::i8));
6115
6116  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6117                     DAG.getConstant(X86CC, MVT::i8), Cond);
6118}
6119
6120SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
6121  SDValue Cond;
6122  SDValue Op0 = Op.getOperand(0);
6123  SDValue Op1 = Op.getOperand(1);
6124  SDValue CC = Op.getOperand(2);
6125  EVT VT = Op.getValueType();
6126  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6127  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
6128  DebugLoc dl = Op.getDebugLoc();
6129
6130  if (isFP) {
6131    unsigned SSECC = 8;
6132    EVT VT0 = Op0.getValueType();
6133    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
6134    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
6135    bool Swap = false;
6136
6137    switch (SetCCOpcode) {
6138    default: break;
6139    case ISD::SETOEQ:
6140    case ISD::SETEQ:  SSECC = 0; break;
6141    case ISD::SETOGT:
6142    case ISD::SETGT: Swap = true; // Fallthrough
6143    case ISD::SETLT:
6144    case ISD::SETOLT: SSECC = 1; break;
6145    case ISD::SETOGE:
6146    case ISD::SETGE: Swap = true; // Fallthrough
6147    case ISD::SETLE:
6148    case ISD::SETOLE: SSECC = 2; break;
6149    case ISD::SETUO:  SSECC = 3; break;
6150    case ISD::SETUNE:
6151    case ISD::SETNE:  SSECC = 4; break;
6152    case ISD::SETULE: Swap = true;
6153    case ISD::SETUGE: SSECC = 5; break;
6154    case ISD::SETULT: Swap = true;
6155    case ISD::SETUGT: SSECC = 6; break;
6156    case ISD::SETO:   SSECC = 7; break;
6157    }
6158    if (Swap)
6159      std::swap(Op0, Op1);
6160
6161    // In the two special cases we can't handle, emit two comparisons.
6162    if (SSECC == 8) {
6163      if (SetCCOpcode == ISD::SETUEQ) {
6164        SDValue UNORD, EQ;
6165        UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
6166        EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
6167        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
6168      }
6169      else if (SetCCOpcode == ISD::SETONE) {
6170        SDValue ORD, NEQ;
6171        ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
6172        NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
6173        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
6174      }
6175      llvm_unreachable("Illegal FP comparison");
6176    }
6177    // Handle all other FP comparisons here.
6178    return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
6179  }
6180
6181  // We are handling one of the integer comparisons here. Since SSE only has
6182  // GT and EQ comparisons for integers, swapping operands and multiple
6183  // operations may be required for some comparisons.
6184  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
6185  bool Swap = false, Invert = false, FlipSigns = false;
6186
6187  switch (VT.getSimpleVT().SimpleTy) {
6188  default: break;
6189  case MVT::v8i8:
6190  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
6191  case MVT::v4i16:
6192  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
6193  case MVT::v2i32:
6194  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
6195  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
6196  }
6197
6198  switch (SetCCOpcode) {
6199  default: break;
6200  case ISD::SETNE:  Invert = true;
6201  case ISD::SETEQ:  Opc = EQOpc; break;
6202  case ISD::SETLT:  Swap = true;
6203  case ISD::SETGT:  Opc = GTOpc; break;
6204  case ISD::SETGE:  Swap = true;
6205  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
6206  case ISD::SETULT: Swap = true;
6207  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
6208  case ISD::SETUGE: Swap = true;
6209  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
6210  }
6211  if (Swap)
6212    std::swap(Op0, Op1);
6213
6214  // Since SSE has no unsigned integer comparisons, we need to flip the sign
6215  // bits of the inputs before performing those operations.
6216  if (FlipSigns) {
6217    EVT EltVT = VT.getVectorElementType();
6218    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
6219                                      EltVT);
6220    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
6221    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
6222                                    SignBits.size());
6223    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
6224    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
6225  }
6226
6227  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
6228
6229  // If the logical-not of the result is required, perform that now.
6230  if (Invert)
6231    Result = DAG.getNOT(dl, Result, VT);
6232
6233  return Result;
6234}
6235
6236// isX86LogicalCmp - Return true if the opcode is an X86 logical comparison.
6237static bool isX86LogicalCmp(SDValue Op) {
6238  unsigned Opc = Op.getNode()->getOpcode();
6239  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
6240    return true;
6241  if (Op.getResNo() == 1 &&
6242      (Opc == X86ISD::ADD ||
6243       Opc == X86ISD::SUB ||
6244       Opc == X86ISD::SMUL ||
6245       Opc == X86ISD::UMUL ||
6246       Opc == X86ISD::INC ||
6247       Opc == X86ISD::DEC ||
6248       Opc == X86ISD::OR ||
6249       Opc == X86ISD::XOR ||
6250       Opc == X86ISD::AND))
6251    return true;
6252
6253  return false;
6254}
6255
6256SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
6257  bool addTest = true;
6258  SDValue Cond  = Op.getOperand(0);
6259  DebugLoc dl = Op.getDebugLoc();
6260  SDValue CC;
6261
6262  if (Cond.getOpcode() == ISD::SETCC) {
6263    SDValue NewCond = LowerSETCC(Cond, DAG);
6264    if (NewCond.getNode())
6265      Cond = NewCond;
6266  }
6267
6268  // (select (x == 0), -1, 0) -> (sign_bit (x - 1))
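  // (Roughly "cmpl $1, %reg; sbbl %res, %res" once selected: the compare sets
  //  CF exactly when x was zero, and the subtract-with-borrow then yields
  //  all-ones or all-zeros. Illustrative sketch only.)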
6269  SDValue Op1 = Op.getOperand(1);
6270  SDValue Op2 = Op.getOperand(2);
6271  if (Cond.getOpcode() == X86ISD::SETCC &&
6272      cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
6273    SDValue Cmp = Cond.getOperand(1);
6274    if (Cmp.getOpcode() == X86ISD::CMP) {
6275      ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
6276      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
6277      ConstantSDNode *RHSC =
6278        dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
6279      if (N1C && N1C->isAllOnesValue() &&
6280          N2C && N2C->isNullValue() &&
6281          RHSC && RHSC->isNullValue()) {
6282        SDValue CmpOp0 = Cmp.getOperand(0);
6283        Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
6284                          CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
6285        return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
6286                           DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
6287      }
6288    }
6289  }
6290
6291  // Look past (and (setcc_carry (cmp ...)), 1).
6292  if (Cond.getOpcode() == ISD::AND &&
6293      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6294    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6295    if (C && C->getAPIntValue() == 1)
6296      Cond = Cond.getOperand(0);
6297  }
6298
6299  // If the condition flag is set by an X86ISD::CMP, then use it as the
6300  // condition-setting operand in place of the X86ISD::SETCC.
6301  if (Cond.getOpcode() == X86ISD::SETCC ||
6302      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6303    CC = Cond.getOperand(0);
6304
6305    SDValue Cmp = Cond.getOperand(1);
6306    unsigned Opc = Cmp.getOpcode();
6307    EVT VT = Op.getValueType();
6308
6309    bool IllegalFPCMov = false;
6310    if (VT.isFloatingPoint() && !VT.isVector() &&
6311        !isScalarFPTypeInSSEReg(VT))  // FPStack?
6312      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
6313
6314    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
6315        Opc == X86ISD::BT) { // FIXME
6316      Cond = Cmp;
6317      addTest = false;
6318    }
6319  }
6320
6321  if (addTest) {
6322    // Look past the truncate.
6323    if (Cond.getOpcode() == ISD::TRUNCATE)
6324      Cond = Cond.getOperand(0);
6325
6326    // We know the result of AND is compared against zero. Try to match
6327    // it to BT.
6328    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6329      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6330      if (NewSetCC.getNode()) {
6331        CC = NewSetCC.getOperand(0);
6332        Cond = NewSetCC.getOperand(1);
6333        addTest = false;
6334      }
6335    }
6336  }
6337
6338  if (addTest) {
6339    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6340    Cond = EmitTest(Cond, X86::COND_NE, DAG);
6341  }
6342
6343  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
6344  // condition is true.
6345  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
6346  SDValue Ops[] = { Op2, Op1, CC, Cond };
6347  return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
6348}
6349
6350// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
6351// X86ISD::SETCC nodes, each of which has no other use apart from the
6352// AND / OR.
6353static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
6354  Opc = Op.getOpcode();
6355  if (Opc != ISD::OR && Opc != ISD::AND)
6356    return false;
6357  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6358          Op.getOperand(0).hasOneUse() &&
6359          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
6360          Op.getOperand(1).hasOneUse());
6361}
6362
6363// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC
6364// and 1, and the SETCC node has a single use.
6365static bool isXor1OfSetCC(SDValue Op) {
6366  if (Op.getOpcode() != ISD::XOR)
6367    return false;
6368  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6369  if (N1C && N1C->getAPIntValue() == 1) {
6370    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6371      Op.getOperand(0).hasOneUse();
6372  }
6373  return false;
6374}
6375
6376SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6377  bool addTest = true;
6378  SDValue Chain = Op.getOperand(0);
6379  SDValue Cond  = Op.getOperand(1);
6380  SDValue Dest  = Op.getOperand(2);
6381  DebugLoc dl = Op.getDebugLoc();
6382  SDValue CC;
6383
6384  if (Cond.getOpcode() == ISD::SETCC) {
6385    SDValue NewCond = LowerSETCC(Cond, DAG);
6386    if (NewCond.getNode())
6387      Cond = NewCond;
6388  }
6389#if 0
6390  // FIXME: LowerXALUO doesn't handle these!!
6391  else if (Cond.getOpcode() == X86ISD::ADD  ||
6392           Cond.getOpcode() == X86ISD::SUB  ||
6393           Cond.getOpcode() == X86ISD::SMUL ||
6394           Cond.getOpcode() == X86ISD::UMUL)
6395    Cond = LowerXALUO(Cond, DAG);
6396#endif
6397
6398  // Look past (and (setcc_carry (cmp ...)), 1).
6399  if (Cond.getOpcode() == ISD::AND &&
6400      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6401    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6402    if (C && C->getAPIntValue() == 1)
6403      Cond = Cond.getOperand(0);
6404  }
6405
6406  // If the condition flag is set by an X86ISD::CMP, then use it as the
6407  // condition-setting operand in place of the X86ISD::SETCC.
6408  if (Cond.getOpcode() == X86ISD::SETCC ||
6409      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6410    CC = Cond.getOperand(0);
6411
6412    SDValue Cmp = Cond.getOperand(1);
6413    unsigned Opc = Cmp.getOpcode();
6414    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
6415    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
6416      Cond = Cmp;
6417      addTest = false;
6418    } else {
6419      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
6420      default: break;
6421      case X86::COND_O:
6422      case X86::COND_B:
6423        // These can only come from an arithmetic instruction with overflow,
6424        // e.g. SADDO, UADDO.
6425        Cond = Cond.getNode()->getOperand(1);
6426        addTest = false;
6427        break;
6428      }
6429    }
6430  } else {
6431    unsigned CondOpc;
6432    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
6433      SDValue Cmp = Cond.getOperand(0).getOperand(1);
6434      if (CondOpc == ISD::OR) {
6435        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
6436        // two branches instead of an explicit OR instruction with a
6437        // separate test.
6438        if (Cmp == Cond.getOperand(1).getOperand(1) &&
6439            isX86LogicalCmp(Cmp)) {
6440          CC = Cond.getOperand(0).getOperand(0);
6441          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6442                              Chain, Dest, CC, Cmp);
6443          CC = Cond.getOperand(1).getOperand(0);
6444          Cond = Cmp;
6445          addTest = false;
6446        }
6447      } else { // ISD::AND
6448        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
6449        // two branches instead of an explicit AND instruction with a
6450        // separate test. However, we only do this if this block doesn't
6451        // have a fall-through edge, because this requires an explicit
6452        // jmp when the condition is false.
6453        if (Cmp == Cond.getOperand(1).getOperand(1) &&
6454            isX86LogicalCmp(Cmp) &&
6455            Op.getNode()->hasOneUse()) {
6456          X86::CondCode CCode =
6457            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6458          CCode = X86::GetOppositeBranchCondition(CCode);
6459          CC = DAG.getConstant(CCode, MVT::i8);
6460          SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
6461          // Look for an unconditional branch following this conditional branch.
6462          // We need this because we need to reverse the successors in order
6463          // to implement FCMP_OEQ.
6464          if (User.getOpcode() == ISD::BR) {
6465            SDValue FalseBB = User.getOperand(1);
6466            SDValue NewBR =
6467              DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
6468            assert(NewBR == User);
6469            Dest = FalseBB;
6470
6471            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6472                                Chain, Dest, CC, Cmp);
6473            X86::CondCode CCode =
6474              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
6475            CCode = X86::GetOppositeBranchCondition(CCode);
6476            CC = DAG.getConstant(CCode, MVT::i8);
6477            Cond = Cmp;
6478            addTest = false;
6479          }
6480        }
6481      }
6482    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
6483      // Recognize 'xorb (setcc), 1' patterns; the xor inverts the condition.
6484      // It should be transformed by the DAG combiner except when the condition
6485      // is set by an arithmetic-with-overflow node.
6486      X86::CondCode CCode =
6487        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6488      CCode = X86::GetOppositeBranchCondition(CCode);
6489      CC = DAG.getConstant(CCode, MVT::i8);
6490      Cond = Cond.getOperand(0).getOperand(1);
6491      addTest = false;
6492    }
6493  }
6494
6495  if (addTest) {
6496    // Look past the truncate.
6497    if (Cond.getOpcode() == ISD::TRUNCATE)
6498      Cond = Cond.getOperand(0);
6499
6500    // We know the result of AND is compared against zero. Try to match
6501    // it to BT.
6502    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6503      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6504      if (NewSetCC.getNode()) {
6505        CC = NewSetCC.getOperand(0);
6506        Cond = NewSetCC.getOperand(1);
6507        addTest = false;
6508      }
6509    }
6510  }
6511
6512  if (addTest) {
6513    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6514    Cond = EmitTest(Cond, X86::COND_NE, DAG);
6515  }
6516  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6517                     Chain, Dest, CC, Cond);
6518}
6519
6520
6521// Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
6522// Calls to _alloca are needed to probe the stack when allocating more than 4K
6523// bytes in one go. Touching the stack at 4K increments is necessary to ensure
6524// that the guard pages used by the OS virtual memory manager are allocated in
6525// the correct sequence.
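//
// For example, a single 12K allocation must touch the new stack region a page
// at a time (roughly: probe at esp-4K, esp-8K, esp-12K, then adjust esp),
// which is what the _alloca helper does; jumping straight past the guard page
// would fault or silently miss growing the stack. (Illustrative description
// of the probing scheme, not the literal code emitted here.)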
6526SDValue
6527X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
6528                                           SelectionDAG &DAG) {
6529  assert(Subtarget->isTargetCygMing() &&
6530         "This should be used only on Cygwin/Mingw targets");
6531  DebugLoc dl = Op.getDebugLoc();
6532
6533  // Get the inputs.
6534  SDValue Chain = Op.getOperand(0);
6535  SDValue Size  = Op.getOperand(1);
6536  // FIXME: Ensure alignment here
6537
6538  SDValue Flag;
6539
6540  EVT IntPtr = getPointerTy();
6541  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
6542
6543  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
6544  Flag = Chain.getValue(1);
6545
6546  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
6547
6548  Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag);
6549  Flag = Chain.getValue(1);
6550
6551  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
6552
6553  SDValue Ops1[2] = { Chain.getValue(0), Chain };
6554  return DAG.getMergeValues(Ops1, 2, dl);
6555}
6556
6557SDValue
6558X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
6559                                           SDValue Chain,
6560                                           SDValue Dst, SDValue Src,
6561                                           SDValue Size, unsigned Align,
6562                                           bool isVolatile,
6563                                           const Value *DstSV,
6564                                           uint64_t DstSVOff) {
6565  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
6566
6567  // If not DWORD aligned or size is more than the threshold, call the library.
6568  // The libc version is likely to be faster for these cases. It can use the
6569  // address value and run time information about the CPU.
6570  if ((Align & 3) != 0 ||
6571      !ConstantSize ||
6572      ConstantSize->getZExtValue() >
6573        getSubtarget()->getMaxInlineSizeThreshold()) {
6574    SDValue InFlag(0, 0);
6575
6576    // Check to see if there is a specialized entry-point for memory zeroing.
6577    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
6578
6579    if (const char *bzeroEntry =  V &&
6580        V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
6581      EVT IntPtr = getPointerTy();
6582      const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext());
6583      TargetLowering::ArgListTy Args;
6584      TargetLowering::ArgListEntry Entry;
6585      Entry.Node = Dst;
6586      Entry.Ty = IntPtrTy;
6587      Args.push_back(Entry);
6588      Entry.Node = Size;
6589      Args.push_back(Entry);
6590      std::pair<SDValue,SDValue> CallResult =
6591        LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
6592                    false, false, false, false,
6593                    0, CallingConv::C, false, /*isReturnValueUsed=*/false,
6594                    DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
6595      return CallResult.second;
6596    }
6597
6598    // Otherwise have the target-independent code call memset.
6599    return SDValue();
6600  }
6601
6602  uint64_t SizeVal = ConstantSize->getZExtValue();
6603  SDValue InFlag(0, 0);
6604  EVT AVT;
6605  SDValue Count;
6606  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
6607  unsigned BytesLeft = 0;
6608  bool TwoRepStos = false;
6609  if (ValC) {
6610    unsigned ValReg;
6611    uint64_t Val = ValC->getZExtValue() & 255;
6612
6613    // If the value is a constant, then we can potentially use larger store elements.
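    // e.g. Val = 0xAB becomes 0xABAB for WORD stores, 0xABABABAB for DWORD
    // stores, and 0xABABABABABABABAB for QWORD stores on x86-64.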
6614    switch (Align & 3) {
6615    case 2:   // WORD aligned
6616      AVT = MVT::i16;
6617      ValReg = X86::AX;
6618      Val = (Val << 8) | Val;
6619      break;
6620    case 0:  // DWORD aligned
6621      AVT = MVT::i32;
6622      ValReg = X86::EAX;
6623      Val = (Val << 8)  | Val;
6624      Val = (Val << 16) | Val;
6625      if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
6626        AVT = MVT::i64;
6627        ValReg = X86::RAX;
6628        Val = (Val << 32) | Val;
6629      }
6630      break;
6631    default:  // Byte aligned
6632      AVT = MVT::i8;
6633      ValReg = X86::AL;
6634      Count = DAG.getIntPtrConstant(SizeVal);
6635      break;
6636    }
6637
6638    if (AVT.bitsGT(MVT::i8)) {
6639      unsigned UBytes = AVT.getSizeInBits() / 8;
6640      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
6641      BytesLeft = SizeVal % UBytes;
6642    }
6643
6644    Chain  = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
6645                              InFlag);
6646    InFlag = Chain.getValue(1);
6647  } else {
6648    AVT = MVT::i8;
6649    Count  = DAG.getIntPtrConstant(SizeVal);
6650    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
6651    InFlag = Chain.getValue(1);
6652  }
6653
6654  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
6655                                                              X86::ECX,
6656                            Count, InFlag);
6657  InFlag = Chain.getValue(1);
6658  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
6659                                                              X86::EDI,
6660                            Dst, InFlag);
6661  InFlag = Chain.getValue(1);
6662
6663  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6664  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
6665  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
6666
6667  if (TwoRepStos) {
6668    InFlag = Chain.getValue(1);
6669    Count  = Size;
6670    EVT CVT = Count.getValueType();
6671    SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
6672                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
6673    Chain  = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
6674                                                             X86::ECX,
6675                              Left, InFlag);
6676    InFlag = Chain.getValue(1);
6677    Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6678    SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
6679    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
6680  } else if (BytesLeft) {
6681    // Handle the last 1 - 7 bytes.
6682    unsigned Offset = SizeVal - BytesLeft;
6683    EVT AddrVT = Dst.getValueType();
6684    EVT SizeVT = Size.getValueType();
6685
6686    Chain = DAG.getMemset(Chain, dl,
6687                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
6688                                      DAG.getConstant(Offset, AddrVT)),
6689                          Src,
6690                          DAG.getConstant(BytesLeft, SizeVT),
6691                          Align, isVolatile, DstSV, DstSVOff + Offset);
6692  }
6693
6694  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
6695  return Chain;
6696}
6697
6698SDValue
6699X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
6700                                      SDValue Chain, SDValue Dst, SDValue Src,
6701                                      SDValue Size, unsigned Align,
6702                                      bool isVolatile, bool AlwaysInline,
6703                                      const Value *DstSV, uint64_t DstSVOff,
6704                                      const Value *SrcSV, uint64_t SrcSVOff) {
6705  // This requires the copy size to be a constant, preferably
6706  // within a subtarget-specific limit.
6707  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
6708  if (!ConstantSize)
6709    return SDValue();
6710  uint64_t SizeVal = ConstantSize->getZExtValue();
6711  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
6712    return SDValue();
6713
6714  // If not DWORD aligned, call the library.
6715  if ((Align & 3) != 0)
6716    return SDValue();
6717
6718  // DWORD aligned
6719  EVT AVT = MVT::i32;
6720  if (Subtarget->is64Bit() && ((Align & 0x7) == 0))  // QWORD aligned
6721    AVT = MVT::i64;
6722
6723  unsigned UBytes = AVT.getSizeInBits() / 8;
6724  unsigned CountVal = SizeVal / UBytes;
6725  SDValue Count = DAG.getIntPtrConstant(CountVal);
6726  unsigned BytesLeft = SizeVal % UBytes;
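  // e.g. a 100-byte QWORD-aligned copy issues CountVal = 12 rep movsq
  // iterations and leaves BytesLeft = 4 for the inline memcpy below.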
6727
6728  SDValue InFlag(0, 0);
6729  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
6730                                                              X86::ECX,
6731                            Count, InFlag);
6732  InFlag = Chain.getValue(1);
6733  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
6734                                                              X86::EDI,
6735                            Dst, InFlag);
6736  InFlag = Chain.getValue(1);
6737  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
6738                                                              X86::ESI,
6739                            Src, InFlag);
6740  InFlag = Chain.getValue(1);
6741
6742  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6743  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
6744  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
6745                                array_lengthof(Ops));
6746
6747  SmallVector<SDValue, 4> Results;
6748  Results.push_back(RepMovs);
6749  if (BytesLeft) {
6750    // Handle the last 1 - 7 bytes.
6751    unsigned Offset = SizeVal - BytesLeft;
6752    EVT DstVT = Dst.getValueType();
6753    EVT SrcVT = Src.getValueType();
6754    EVT SizeVT = Size.getValueType();
6755    Results.push_back(DAG.getMemcpy(Chain, dl,
6756                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
6757                                                DAG.getConstant(Offset, DstVT)),
6758                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
6759                                                DAG.getConstant(Offset, SrcVT)),
6760                                    DAG.getConstant(BytesLeft, SizeVT),
6761                                    Align, isVolatile, AlwaysInline,
6762                                    DstSV, DstSVOff + Offset,
6763                                    SrcSV, SrcSVOff + Offset));
6764  }
6765
6766  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6767                     &Results[0], Results.size());
6768}
6769
6770SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
6771  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6772  DebugLoc dl = Op.getDebugLoc();
6773
6774  if (!Subtarget->is64Bit()) {
6775    // vastart just stores the address of the VarArgsFrameIndex slot into the
6776    // memory location argument.
6777    SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
6778    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
6779                        false, false, 0);
6780  }
6781
6782  // __va_list_tag:
6783  //   gp_offset         (ranges from 0 to 6 * 8 = 48)
6784  //   fp_offset         (ranges from 48 to 48 + 8 * 16 = 176)
6785  //   overflow_arg_area (points to parameters passed in memory).
6786  //   reg_save_area
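  // Field offsets within the 24-byte x86-64 va_list: gp_offset at 0, fp_offset
  // at 4, overflow_arg_area at 8, reg_save_area at 16, matching the stores below.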
6787  SmallVector<SDValue, 8> MemOps;
6788  SDValue FIN = Op.getOperand(1);
6789  // Store gp_offset
6790  SDValue Store = DAG.getStore(Op.getOperand(0), dl,
6791                               DAG.getConstant(VarArgsGPOffset, MVT::i32),
6792                               FIN, SV, 0, false, false, 0);
6793  MemOps.push_back(Store);
6794
6795  // Store fp_offset
6796  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6797                    FIN, DAG.getIntPtrConstant(4));
6798  Store = DAG.getStore(Op.getOperand(0), dl,
6799                       DAG.getConstant(VarArgsFPOffset, MVT::i32),
6800                       FIN, SV, 0, false, false, 0);
6801  MemOps.push_back(Store);
6802
6803  // Store ptr to overflow_arg_area
6804  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6805                    FIN, DAG.getIntPtrConstant(4));
6806  SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
6807  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0,
6808                       false, false, 0);
6809  MemOps.push_back(Store);
6810
6811  // Store ptr to reg_save_area.
6812  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6813                    FIN, DAG.getIntPtrConstant(8));
6814  SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
6815  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0,
6816                       false, false, 0);
6817  MemOps.push_back(Store);
6818  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6819                     &MemOps[0], MemOps.size());
6820}
6821
6822SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
6823  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6824  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
6825  SDValue Chain = Op.getOperand(0);
6826  SDValue SrcPtr = Op.getOperand(1);
6827  SDValue SrcSV = Op.getOperand(2);
6828
6829  report_fatal_error("VAArgInst is not yet implemented for x86-64!");
6830  return SDValue();
6831}
6832
6833SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
6834  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6835  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
6836  SDValue Chain = Op.getOperand(0);
6837  SDValue DstPtr = Op.getOperand(1);
6838  SDValue SrcPtr = Op.getOperand(2);
6839  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
6840  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
6841  DebugLoc dl = Op.getDebugLoc();
6842
6843  return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
6844                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
6845                       false, DstSV, 0, SrcSV, 0);
6846}
6847
6848SDValue
6849X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
6850  DebugLoc dl = Op.getDebugLoc();
6851  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6852  switch (IntNo) {
6853  default: return SDValue();    // Don't custom lower most intrinsics.
6854  // Comparison intrinsics.
6855  case Intrinsic::x86_sse_comieq_ss:
6856  case Intrinsic::x86_sse_comilt_ss:
6857  case Intrinsic::x86_sse_comile_ss:
6858  case Intrinsic::x86_sse_comigt_ss:
6859  case Intrinsic::x86_sse_comige_ss:
6860  case Intrinsic::x86_sse_comineq_ss:
6861  case Intrinsic::x86_sse_ucomieq_ss:
6862  case Intrinsic::x86_sse_ucomilt_ss:
6863  case Intrinsic::x86_sse_ucomile_ss:
6864  case Intrinsic::x86_sse_ucomigt_ss:
6865  case Intrinsic::x86_sse_ucomige_ss:
6866  case Intrinsic::x86_sse_ucomineq_ss:
6867  case Intrinsic::x86_sse2_comieq_sd:
6868  case Intrinsic::x86_sse2_comilt_sd:
6869  case Intrinsic::x86_sse2_comile_sd:
6870  case Intrinsic::x86_sse2_comigt_sd:
6871  case Intrinsic::x86_sse2_comige_sd:
6872  case Intrinsic::x86_sse2_comineq_sd:
6873  case Intrinsic::x86_sse2_ucomieq_sd:
6874  case Intrinsic::x86_sse2_ucomilt_sd:
6875  case Intrinsic::x86_sse2_ucomile_sd:
6876  case Intrinsic::x86_sse2_ucomigt_sd:
6877  case Intrinsic::x86_sse2_ucomige_sd:
6878  case Intrinsic::x86_sse2_ucomineq_sd: {
6879    unsigned Opc = 0;
6880    ISD::CondCode CC = ISD::SETCC_INVALID;
6881    switch (IntNo) {
6882    default: break;
6883    case Intrinsic::x86_sse_comieq_ss:
6884    case Intrinsic::x86_sse2_comieq_sd:
6885      Opc = X86ISD::COMI;
6886      CC = ISD::SETEQ;
6887      break;
6888    case Intrinsic::x86_sse_comilt_ss:
6889    case Intrinsic::x86_sse2_comilt_sd:
6890      Opc = X86ISD::COMI;
6891      CC = ISD::SETLT;
6892      break;
6893    case Intrinsic::x86_sse_comile_ss:
6894    case Intrinsic::x86_sse2_comile_sd:
6895      Opc = X86ISD::COMI;
6896      CC = ISD::SETLE;
6897      break;
6898    case Intrinsic::x86_sse_comigt_ss:
6899    case Intrinsic::x86_sse2_comigt_sd:
6900      Opc = X86ISD::COMI;
6901      CC = ISD::SETGT;
6902      break;
6903    case Intrinsic::x86_sse_comige_ss:
6904    case Intrinsic::x86_sse2_comige_sd:
6905      Opc = X86ISD::COMI;
6906      CC = ISD::SETGE;
6907      break;
6908    case Intrinsic::x86_sse_comineq_ss:
6909    case Intrinsic::x86_sse2_comineq_sd:
6910      Opc = X86ISD::COMI;
6911      CC = ISD::SETNE;
6912      break;
6913    case Intrinsic::x86_sse_ucomieq_ss:
6914    case Intrinsic::x86_sse2_ucomieq_sd:
6915      Opc = X86ISD::UCOMI;
6916      CC = ISD::SETEQ;
6917      break;
6918    case Intrinsic::x86_sse_ucomilt_ss:
6919    case Intrinsic::x86_sse2_ucomilt_sd:
6920      Opc = X86ISD::UCOMI;
6921      CC = ISD::SETLT;
6922      break;
6923    case Intrinsic::x86_sse_ucomile_ss:
6924    case Intrinsic::x86_sse2_ucomile_sd:
6925      Opc = X86ISD::UCOMI;
6926      CC = ISD::SETLE;
6927      break;
6928    case Intrinsic::x86_sse_ucomigt_ss:
6929    case Intrinsic::x86_sse2_ucomigt_sd:
6930      Opc = X86ISD::UCOMI;
6931      CC = ISD::SETGT;
6932      break;
6933    case Intrinsic::x86_sse_ucomige_ss:
6934    case Intrinsic::x86_sse2_ucomige_sd:
6935      Opc = X86ISD::UCOMI;
6936      CC = ISD::SETGE;
6937      break;
6938    case Intrinsic::x86_sse_ucomineq_ss:
6939    case Intrinsic::x86_sse2_ucomineq_sd:
6940      Opc = X86ISD::UCOMI;
6941      CC = ISD::SETNE;
6942      break;
6943    }
6944
6945    SDValue LHS = Op.getOperand(1);
6946    SDValue RHS = Op.getOperand(2);
6947    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
6948    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
6949    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
6950    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6951                                DAG.getConstant(X86CC, MVT::i8), Cond);
6952    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6953  }
6954  // ptest intrinsics. The intrinsics these come from are designed to return
6955  // an integer value, not just an instruction, so lower them to the ptest
6956  // pattern and a setcc for the result.
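  // e.g. ptestz(a, b) becomes PTEST a, b followed by SETE on EFLAGS,
  // zero-extended to the i32 result the intrinsic is defined to produce.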
6957  case Intrinsic::x86_sse41_ptestz:
6958  case Intrinsic::x86_sse41_ptestc:
6959  case Intrinsic::x86_sse41_ptestnzc: {
6960    unsigned X86CC = 0;
6961    switch (IntNo) {
6962    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
6963    case Intrinsic::x86_sse41_ptestz:
6964      // ZF = 1
6965      X86CC = X86::COND_E;
6966      break;
6967    case Intrinsic::x86_sse41_ptestc:
6968      // CF = 1
6969      X86CC = X86::COND_B;
6970      break;
6971    case Intrinsic::x86_sse41_ptestnzc:
6972      // ZF and CF = 0
6973      X86CC = X86::COND_A;
6974      break;
6975    }
6976
6977    SDValue LHS = Op.getOperand(1);
6978    SDValue RHS = Op.getOperand(2);
6979    SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS);
6980    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
6981    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
6982    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6983  }
6984
6985  // Fix vector shift instructions where the last operand is a non-immediate
6986  // i32 value.
6987  case Intrinsic::x86_sse2_pslli_w:
6988  case Intrinsic::x86_sse2_pslli_d:
6989  case Intrinsic::x86_sse2_pslli_q:
6990  case Intrinsic::x86_sse2_psrli_w:
6991  case Intrinsic::x86_sse2_psrli_d:
6992  case Intrinsic::x86_sse2_psrli_q:
6993  case Intrinsic::x86_sse2_psrai_w:
6994  case Intrinsic::x86_sse2_psrai_d:
6995  case Intrinsic::x86_mmx_pslli_w:
6996  case Intrinsic::x86_mmx_pslli_d:
6997  case Intrinsic::x86_mmx_pslli_q:
6998  case Intrinsic::x86_mmx_psrli_w:
6999  case Intrinsic::x86_mmx_psrli_d:
7000  case Intrinsic::x86_mmx_psrli_q:
7001  case Intrinsic::x86_mmx_psrai_w:
7002  case Intrinsic::x86_mmx_psrai_d: {
7003    SDValue ShAmt = Op.getOperand(2);
7004    if (isa<ConstantSDNode>(ShAmt))
7005      return SDValue();
7006
7007    unsigned NewIntNo = 0;
7008    EVT ShAmtVT = MVT::v4i32;
7009    switch (IntNo) {
7010    case Intrinsic::x86_sse2_pslli_w:
7011      NewIntNo = Intrinsic::x86_sse2_psll_w;
7012      break;
7013    case Intrinsic::x86_sse2_pslli_d:
7014      NewIntNo = Intrinsic::x86_sse2_psll_d;
7015      break;
7016    case Intrinsic::x86_sse2_pslli_q:
7017      NewIntNo = Intrinsic::x86_sse2_psll_q;
7018      break;
7019    case Intrinsic::x86_sse2_psrli_w:
7020      NewIntNo = Intrinsic::x86_sse2_psrl_w;
7021      break;
7022    case Intrinsic::x86_sse2_psrli_d:
7023      NewIntNo = Intrinsic::x86_sse2_psrl_d;
7024      break;
7025    case Intrinsic::x86_sse2_psrli_q:
7026      NewIntNo = Intrinsic::x86_sse2_psrl_q;
7027      break;
7028    case Intrinsic::x86_sse2_psrai_w:
7029      NewIntNo = Intrinsic::x86_sse2_psra_w;
7030      break;
7031    case Intrinsic::x86_sse2_psrai_d:
7032      NewIntNo = Intrinsic::x86_sse2_psra_d;
7033      break;
7034    default: {
7035      ShAmtVT = MVT::v2i32;
7036      switch (IntNo) {
7037      case Intrinsic::x86_mmx_pslli_w:
7038        NewIntNo = Intrinsic::x86_mmx_psll_w;
7039        break;
7040      case Intrinsic::x86_mmx_pslli_d:
7041        NewIntNo = Intrinsic::x86_mmx_psll_d;
7042        break;
7043      case Intrinsic::x86_mmx_pslli_q:
7044        NewIntNo = Intrinsic::x86_mmx_psll_q;
7045        break;
7046      case Intrinsic::x86_mmx_psrli_w:
7047        NewIntNo = Intrinsic::x86_mmx_psrl_w;
7048        break;
7049      case Intrinsic::x86_mmx_psrli_d:
7050        NewIntNo = Intrinsic::x86_mmx_psrl_d;
7051        break;
7052      case Intrinsic::x86_mmx_psrli_q:
7053        NewIntNo = Intrinsic::x86_mmx_psrl_q;
7054        break;
7055      case Intrinsic::x86_mmx_psrai_w:
7056        NewIntNo = Intrinsic::x86_mmx_psra_w;
7057        break;
7058      case Intrinsic::x86_mmx_psrai_d:
7059        NewIntNo = Intrinsic::x86_mmx_psra_d;
7060        break;
7061      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
7062      }
7063      break;
7064    }
7065    }
7066
7067    // The vector shift intrinsics with scalar shift amounts use 32-bit values,
7068    // but the SSE2/MMX shift instructions read 64 bits. Set the upper 32 bits
7069    // to zero.
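    // e.g. pslli.w with a variable i32 amount N is rewritten as psll.w whose
    // shift-amount vector is (N, 0, undef, undef), so the 64 bits the
    // instruction reads are just N zero-extended.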
7070    SDValue ShOps[4];
7071    ShOps[0] = ShAmt;
7072    ShOps[1] = DAG.getConstant(0, MVT::i32);
7073    if (ShAmtVT == MVT::v4i32) {
7074      ShOps[2] = DAG.getUNDEF(MVT::i32);
7075      ShOps[3] = DAG.getUNDEF(MVT::i32);
7076      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
7077    } else {
7078      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
7079    }
7080
7081    EVT VT = Op.getValueType();
7082    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
7083    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7084                       DAG.getConstant(NewIntNo, MVT::i32),
7085                       Op.getOperand(1), ShAmt);
7086  }
7087  }
7088}
7089
7090SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
7091  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7092  DebugLoc dl = Op.getDebugLoc();
7093
7094  if (Depth > 0) {
7095    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
7096    SDValue Offset =
7097      DAG.getConstant(TD->getPointerSize(),
7098                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
7099    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
7100                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
7101                                   FrameAddr, Offset),
7102                       NULL, 0, false, false, 0);
7103  }
7104
7105  // Just load the return address.
7106  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
7107  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
7108                     RetAddrFI, NULL, 0, false, false, 0);
7109}
7110
7111SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
7112  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7113  MFI->setFrameAddressIsTaken(true);
7114  EVT VT = Op.getValueType();
7115  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
7116  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7117  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
7118  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
7119  while (Depth--)
7120    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
7121                            false, false, 0);
7122  return FrameAddr;
7123}
7124
7125SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
7126                                                     SelectionDAG &DAG) {
7127  return DAG.getIntPtrConstant(2*TD->getPointerSize());
7128}
7129
7130SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
7131{
7132  MachineFunction &MF = DAG.getMachineFunction();
7133  SDValue Chain     = Op.getOperand(0);
7134  SDValue Offset    = Op.getOperand(1);
7135  SDValue Handler   = Op.getOperand(2);
7136  DebugLoc dl       = Op.getDebugLoc();
7137
7138  SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
7139                                  getPointerTy());
7140  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
7141
7142  SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
7143                                  DAG.getIntPtrConstant(-TD->getPointerSize()));
7144  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
7145  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0);
7146  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
7147  MF.getRegInfo().addLiveOut(StoreAddrReg);
7148
7149  return DAG.getNode(X86ISD::EH_RETURN, dl,
7150                     MVT::Other,
7151                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
7152}
7153
7154SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
7155                                             SelectionDAG &DAG) {
7156  SDValue Root = Op.getOperand(0);
7157  SDValue Trmp = Op.getOperand(1); // trampoline
7158  SDValue FPtr = Op.getOperand(2); // nested function
7159  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7160  DebugLoc dl  = Op.getDebugLoc();
7161
7162  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7163
7164  if (Subtarget->is64Bit()) {
7165    SDValue OutChains[6];
7166
7167    // Large code-model.
7168    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
7169    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
7170
7171    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
7172    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
7173
7174    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
7175
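    // The trampoline produced below is laid out as:
    //   [0-1]   REX.WB prefix + movabsq opcode targeting r11
    //   [2-9]   address of the nested function
    //   [10-11] REX.WB prefix + movabsq opcode targeting r10
    //   [12-19] the 'nest' parameter value
    //   [20-21] REX.WB prefix + jmpq opcode
    //   [22]    ModRM byte selecting an indirect jump through r11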
7176    // Load the pointer to the nested function into R11.
7177    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
7178    SDValue Addr = Trmp;
7179    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7180                                Addr, TrmpAddr, 0, false, false, 0);
7181
7182    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7183                       DAG.getConstant(2, MVT::i64));
7184    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2,
7185                                false, false, 2);
7186
7187    // Load the 'nest' parameter value into R10.
7188    // R10 is specified in X86CallingConv.td
7189    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
7190    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7191                       DAG.getConstant(10, MVT::i64));
7192    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7193                                Addr, TrmpAddr, 10, false, false, 0);
7194
7195    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7196                       DAG.getConstant(12, MVT::i64));
7197    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12,
7198                                false, false, 2);
7199
7200    // Jump to the nested function.
7201    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
7202    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7203                       DAG.getConstant(20, MVT::i64));
7204    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7205                                Addr, TrmpAddr, 20, false, false, 0);
7206
7207    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
7208    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7209                       DAG.getConstant(22, MVT::i64));
7210    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
7211                                TrmpAddr, 22, false, false, 0);
7212
7213    SDValue Ops[] =
7214      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
7215    return DAG.getMergeValues(Ops, 2, dl);
7216  } else {
7217    const Function *Func =
7218      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7219    CallingConv::ID CC = Func->getCallingConv();
7220    unsigned NestReg;
7221
7222    switch (CC) {
7223    default:
7224      llvm_unreachable("Unsupported calling convention");
7225    case CallingConv::C:
7226    case CallingConv::X86_StdCall: {
7227      // Pass 'nest' parameter in ECX.
7228      // Must be kept in sync with X86CallingConv.td
7229      NestReg = X86::ECX;
7230
7231      // Check that ECX wasn't needed by an 'inreg' parameter.
7232      const FunctionType *FTy = Func->getFunctionType();
7233      const AttrListPtr &Attrs = Func->getAttributes();
7234
7235      if (!Attrs.isEmpty() && !Func->isVarArg()) {
7236        unsigned InRegCount = 0;
7237        unsigned Idx = 1;
7238
7239        for (FunctionType::param_iterator I = FTy->param_begin(),
7240             E = FTy->param_end(); I != E; ++I, ++Idx)
7241          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
7242            // FIXME: should only count parameters that are lowered to integers.
7243            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
7244
7245        if (InRegCount > 2) {
7246          report_fatal_error("Nest register in use - reduce number of inreg parameters!");
7247        }
7248      }
7249      break;
7250    }
7251    case CallingConv::X86_FastCall:
7252    case CallingConv::Fast:
7253      // Pass 'nest' parameter in EAX.
7254      // Must be kept in sync with X86CallingConv.td
7255      NestReg = X86::EAX;
7256      break;
7257    }
7258
7259    SDValue OutChains[4];
7260    SDValue Addr, Disp;
7261
7262    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7263                       DAG.getConstant(10, MVT::i32));
7264    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
7265
7266    // This is storing the opcode for MOV32ri.
7267    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
7268    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
7269    OutChains[0] = DAG.getStore(Root, dl,
7270                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
7271                                Trmp, TrmpAddr, 0, false, false, 0);
7272
7273    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7274                       DAG.getConstant(1, MVT::i32));
7275    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1,
7276                                false, false, 1);
7277
7278    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
7279    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7280                       DAG.getConstant(5, MVT::i32));
7281    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
7282                                TrmpAddr, 5, false, false, 1);
7283
7284    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7285                       DAG.getConstant(6, MVT::i32));
7286    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6,
7287                                false, false, 1);
7288
7289    SDValue Ops[] =
7290      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
7291    return DAG.getMergeValues(Ops, 2, dl);
7292  }
7293}
7294
7295SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
7296  /*
7297   The rounding mode is in bits 11:10 of the FP control word, and has the following
7298   settings:
7299     00 Round to nearest
7300     01 Round to -inf
7301     10 Round to +inf
7302     11 Round to 0
7303
7304  FLT_ROUNDS, on the other hand, expects the following:
7305    -1 Undefined
7306     0 Round to 0
7307     1 Round to nearest
7308     2 Round to +inf
7309     3 Round to -inf
7310
7311  To perform the conversion, we do:
7312    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
7313  */
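  // Worked example: rounding-control bits 11:10 = 01 (round to -inf) gives
  // ((0 | (1 << 1)) + 1) & 3 = 3, the FLT_ROUNDS value for round to -inf.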
7314
7315  MachineFunction &MF = DAG.getMachineFunction();
7316  const TargetMachine &TM = MF.getTarget();
7317  const TargetFrameInfo &TFI = *TM.getFrameInfo();
7318  unsigned StackAlignment = TFI.getStackAlignment();
7319  EVT VT = Op.getValueType();
7320  DebugLoc dl = Op.getDebugLoc();
7321
7322  // Save FP Control Word to stack slot
7323  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
7324  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7325
7326  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
7327                              DAG.getEntryNode(), StackSlot);
7328
7329  // Load FP Control Word from stack slot
7330  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0,
7331                            false, false, 0);
7332
7333  // Transform as necessary
7334  SDValue CWD1 =
7335    DAG.getNode(ISD::SRL, dl, MVT::i16,
7336                DAG.getNode(ISD::AND, dl, MVT::i16,
7337                            CWD, DAG.getConstant(0x800, MVT::i16)),
7338                DAG.getConstant(11, MVT::i8));
7339  SDValue CWD2 =
7340    DAG.getNode(ISD::SRL, dl, MVT::i16,
7341                DAG.getNode(ISD::AND, dl, MVT::i16,
7342                            CWD, DAG.getConstant(0x400, MVT::i16)),
7343                DAG.getConstant(9, MVT::i8));
7344
7345  SDValue RetVal =
7346    DAG.getNode(ISD::AND, dl, MVT::i16,
7347                DAG.getNode(ISD::ADD, dl, MVT::i16,
7348                            DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
7349                            DAG.getConstant(1, MVT::i16)),
7350                DAG.getConstant(3, MVT::i16));
7351
7352
7353  return DAG.getNode((VT.getSizeInBits() < 16 ?
7354                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
7355}
7356
7357SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
7358  EVT VT = Op.getValueType();
7359  EVT OpVT = VT;
7360  unsigned NumBits = VT.getSizeInBits();
7361  DebugLoc dl = Op.getDebugLoc();
7362
7363  Op = Op.getOperand(0);
7364  if (VT == MVT::i8) {
7365    // Zero extend to i32 since there is not an i8 bsr.
7366    OpVT = MVT::i32;
7367    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
7368  }
7369
7370  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
7371  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
7372  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
7373
7374  // If src is zero (i.e. bsr sets ZF), returns NumBits.
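  // (The CMOV constant is 2*NumBits-1 so that the XOR with NumBits-1 below
  // folds it back to NumBits.)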
7375  SDValue Ops[] = {
7376    Op,
7377    DAG.getConstant(NumBits+NumBits-1, OpVT),
7378    DAG.getConstant(X86::COND_E, MVT::i8),
7379    Op.getValue(1)
7380  };
7381  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
7382
7383  // Finally xor with NumBits-1.
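  // e.g. for the 32-bit input 0x10, BSR yields 4 and 4 ^ 31 = 27, which is the
  // number of leading zeros.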
7384  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
7385
7386  if (VT == MVT::i8)
7387    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
7388  return Op;
7389}
7390
7391SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
7392  EVT VT = Op.getValueType();
7393  EVT OpVT = VT;
7394  unsigned NumBits = VT.getSizeInBits();
7395  DebugLoc dl = Op.getDebugLoc();
7396
7397  Op = Op.getOperand(0);
7398  if (VT == MVT::i8) {
7399    OpVT = MVT::i32;
7400    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
7401  }
7402
7403  // Issue a bsf (scan bits forward) which also sets EFLAGS.
7404  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
7405  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
7406
7407  // If src is zero (i.e. bsf sets ZF), returns NumBits.
7408  SDValue Ops[] = {
7409    Op,
7410    DAG.getConstant(NumBits, OpVT),
7411    DAG.getConstant(X86::COND_E, MVT::i8),
7412    Op.getValue(1)
7413  };
7414  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
7415
7416  if (VT == MVT::i8)
7417    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
7418  return Op;
7419}
7420
7421SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
7422  EVT VT = Op.getValueType();
7423  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
7424  DebugLoc dl = Op.getDebugLoc();
7425
7426  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
7427  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
7428  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
7429  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
7430  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
7431  //
7432  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
7433  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
7434  //  return AloBlo + AloBhi + AhiBlo;
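  //
  //  With a = 2^32*Ahi + Alo and b = 2^32*Bhi + Blo, the product modulo 2^64 is
  //  Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32); the Ahi*Bhi term overflows and drops out.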
7435
7436  SDValue A = Op.getOperand(0);
7437  SDValue B = Op.getOperand(1);
7438
7439  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7440                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7441                       A, DAG.getConstant(32, MVT::i32));
7442  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7443                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7444                       B, DAG.getConstant(32, MVT::i32));
7445  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7446                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7447                       A, B);
7448  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7449                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7450                       A, Bhi);
7451  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7452                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7453                       Ahi, B);
7454  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7455                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7456                       AloBhi, DAG.getConstant(32, MVT::i32));
7457  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7458                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7459                       AhiBlo, DAG.getConstant(32, MVT::i32));
7460  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
7461  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
7462  return Res;
7463}
7464
7465
7466SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
7467  // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
7468  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
7469  // looks for this combo and may remove the "setcc" instruction if the "setcc"
7470  // has only one use.
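  // e.g. sadd.with.overflow becomes an X86ISD::ADD that also produces EFLAGS,
  // with the overflow bit recovered by a SETCC on X86::COND_O.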
7471  SDNode *N = Op.getNode();
7472  SDValue LHS = N->getOperand(0);
7473  SDValue RHS = N->getOperand(1);
7474  unsigned BaseOp = 0;
7475  unsigned Cond = 0;
7476  DebugLoc dl = Op.getDebugLoc();
7477
7478  switch (Op.getOpcode()) {
7479  default: llvm_unreachable("Unknown ovf instruction!");
7480  case ISD::SADDO:
7481    // An add of one will be selected as an INC. Note that INC doesn't
7482    // set CF, so we can't do this for UADDO.
7483    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
7484      if (C->getAPIntValue() == 1) {
7485        BaseOp = X86ISD::INC;
7486        Cond = X86::COND_O;
7487        break;
7488      }
7489    BaseOp = X86ISD::ADD;
7490    Cond = X86::COND_O;
7491    break;
7492  case ISD::UADDO:
7493    BaseOp = X86ISD::ADD;
7494    Cond = X86::COND_B;
7495    break;
7496  case ISD::SSUBO:
7497    // A subtract of one will be selected as a DEC. Note that DEC doesn't
7498    // set CF, so we can't do this for USUBO.
7499    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
7500      if (C->getAPIntValue() == 1) {
7501        BaseOp = X86ISD::DEC;
7502        Cond = X86::COND_O;
7503        break;
7504      }
7505    BaseOp = X86ISD::SUB;
7506    Cond = X86::COND_O;
7507    break;
7508  case ISD::USUBO:
7509    BaseOp = X86ISD::SUB;
7510    Cond = X86::COND_B;
7511    break;
7512  case ISD::SMULO:
7513    BaseOp = X86ISD::SMUL;
7514    Cond = X86::COND_O;
7515    break;
7516  case ISD::UMULO:
7517    BaseOp = X86ISD::UMUL;
7518    Cond = X86::COND_B;
7519    break;
7520  }
7521
7522  // Also sets EFLAGS.
7523  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
7524  SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
7525
7526  SDValue SetCC =
7527    DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
7528                DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
7529
7530  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
7531  return Sum;
7532}
7533
7534SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
7535  EVT T = Op.getValueType();
7536  DebugLoc dl = Op.getDebugLoc();
7537  unsigned Reg = 0;
7538  unsigned size = 0;
7539  switch(T.getSimpleVT().SimpleTy) {
7540  default:
7541    assert(false && "Invalid value type!");
7542  case MVT::i8:  Reg = X86::AL;  size = 1; break;
7543  case MVT::i16: Reg = X86::AX;  size = 2; break;
7544  case MVT::i32: Reg = X86::EAX; size = 4; break;
7545  case MVT::i64:
7546    assert(Subtarget->is64Bit() && "Node not type legal!");
7547    Reg = X86::RAX; size = 8;
7548    break;
7549  }
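  // CMPXCHG expects the value to compare against in AL/AX/EAX/RAX and returns
  // the previous memory contents in the same register, hence the copies below.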
7550  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
7551                                    Op.getOperand(2), SDValue());
7552  SDValue Ops[] = { cpIn.getValue(0),
7553                    Op.getOperand(1),
7554                    Op.getOperand(3),
7555                    DAG.getTargetConstant(size, MVT::i8),
7556                    cpIn.getValue(1) };
7557  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7558  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
7559  SDValue cpOut =
7560    DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
7561  return cpOut;
7562}
7563
7564SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
7565                                                 SelectionDAG &DAG) {
7566  assert(Subtarget->is64Bit() && "Result not type legalized?");
7567  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7568  SDValue TheChain = Op.getOperand(0);
7569  DebugLoc dl = Op.getDebugLoc();
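  // RDTSC leaves the timestamp in EDX:EAX (the upper halves of RDX/RAX are
  // cleared on x86-64); combine the two halves into one i64 with a shift and OR.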
7570  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7571  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
7572  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
7573                                   rax.getValue(2));
7574  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
7575                            DAG.getConstant(32, MVT::i8));
7576  SDValue Ops[] = {
7577    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
7578    rdx.getValue(1)
7579  };
7580  return DAG.getMergeValues(Ops, 2, dl);
7581}
7582
7583SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
7584  SDNode *Node = Op.getNode();
7585  DebugLoc dl = Node->getDebugLoc();
7586  EVT T = Node->getValueType(0);
7587  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
7588                              DAG.getConstant(0, T), Node->getOperand(2));
7589  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
7590                       cast<AtomicSDNode>(Node)->getMemoryVT(),
7591                       Node->getOperand(0),
7592                       Node->getOperand(1), negOp,
7593                       cast<AtomicSDNode>(Node)->getSrcValue(),
7594                       cast<AtomicSDNode>(Node)->getAlignment());
7595}
7596
7597/// LowerOperation - Provide custom lowering hooks for some operations.
7598///
7599SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
7600  switch (Op.getOpcode()) {
7601  default: llvm_unreachable("Should not custom lower this!");
7602  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
7603  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
7604  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
7605  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
7606  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
7607  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7608  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
7609  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
7610  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
7611  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
7612  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
7613  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
7614  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
7615  case ISD::SHL_PARTS:
7616  case ISD::SRA_PARTS:
7617  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
7618  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
7619  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
7620  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
7621  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
7622  case ISD::FABS:               return LowerFABS(Op, DAG);
7623  case ISD::FNEG:               return LowerFNEG(Op, DAG);
7624  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
7625  case ISD::SETCC:              return LowerSETCC(Op, DAG);
7626  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
7627  case ISD::SELECT:             return LowerSELECT(Op, DAG);
7628  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
7629  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
7630  case ISD::VASTART:            return LowerVASTART(Op, DAG);
7631  case ISD::VAARG:              return LowerVAARG(Op, DAG);
7632  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
7633  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7634  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
7635  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
7636  case ISD::FRAME_TO_ARGS_OFFSET:
7637                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
7638  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
7639  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
7640  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
7641  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
7642  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
7643  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
7644  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
7645  case ISD::SADDO:
7646  case ISD::UADDO:
7647  case ISD::SSUBO:
7648  case ISD::USUBO:
7649  case ISD::SMULO:
7650  case ISD::UMULO:              return LowerXALUO(Op, DAG);
7651  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
7652  }
7653}
7654
7655void X86TargetLowering::
7656ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
7657                        SelectionDAG &DAG, unsigned NewOp) {
7658  EVT T = Node->getValueType(0);
7659  DebugLoc dl = Node->getDebugLoc();
7660  assert (T == MVT::i64 && "Only know how to expand i64 atomics");
7661
7662  SDValue Chain = Node->getOperand(0);
7663  SDValue In1 = Node->getOperand(1);
7664  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7665                             Node->getOperand(2), DAG.getIntPtrConstant(0));
7666  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7667                             Node->getOperand(2), DAG.getIntPtrConstant(1));
7668  SDValue Ops[] = { Chain, In1, In2L, In2H };
7669  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
7670  SDValue Result =
7671    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
7672                            cast<MemSDNode>(Node)->getMemOperand());
7673  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
7674  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
7675  Results.push_back(Result.getValue(2));
7676}
7677
7678/// ReplaceNodeResults - Replace a node with an illegal result type
7679/// with a new node built out of custom code.
7680void X86TargetLowering::ReplaceNodeResults(SDNode *N,
7681                                           SmallVectorImpl<SDValue>&Results,
7682                                           SelectionDAG &DAG) {
7683  DebugLoc dl = N->getDebugLoc();
7684  switch (N->getOpcode()) {
7685  default:
7686    assert(false && "Do not know how to custom type legalize this operation!");
7687    return;
7688  case ISD::FP_TO_SINT: {
7689    std::pair<SDValue,SDValue> Vals =
7690        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
7691    SDValue FIST = Vals.first, StackSlot = Vals.second;
7692    if (FIST.getNode() != 0) {
7693      EVT VT = N->getValueType(0);
7694      // Return a load from the stack slot.
7695      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
7696                                    false, false, 0));
7697    }
7698    return;
7699  }
7700  case ISD::READCYCLECOUNTER: {
7701    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7702    SDValue TheChain = N->getOperand(0);
7703    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7704    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
7705                                     rd.getValue(1));
7706    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
7707                                     eax.getValue(2));
7708    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
7709    SDValue Ops[] = { eax, edx };
7710    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
7711    Results.push_back(edx.getValue(1));
7712    return;
7713  }
7714  case ISD::ATOMIC_CMP_SWAP: {
7715    EVT T = N->getValueType(0);
7716    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
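    // CMPXCHG8B takes the expected value in EDX:EAX and the replacement in
    // ECX:EBX, and returns the loaded value in EDX:EAX, as set up below.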
7717    SDValue cpInL, cpInH;
7718    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
7719                        DAG.getConstant(0, MVT::i32));
7720    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
7721                        DAG.getConstant(1, MVT::i32));
7722    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
7723    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
7724                             cpInL.getValue(1));
7725    SDValue swapInL, swapInH;
7726    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
7727                          DAG.getConstant(0, MVT::i32));
7728    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
7729                          DAG.getConstant(1, MVT::i32));
7730    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
7731                               cpInH.getValue(1));
7732    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
7733                               swapInL.getValue(1));
7734    SDValue Ops[] = { swapInH.getValue(0),
7735                      N->getOperand(1),
7736                      swapInH.getValue(1) };
7737    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7738    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
7739    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
7740                                        MVT::i32, Result.getValue(1));
7741    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
7742                                        MVT::i32, cpOutL.getValue(2));
7743    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
7744    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
7745    Results.push_back(cpOutH.getValue(1));
7746    return;
7747  }
7748  case ISD::ATOMIC_LOAD_ADD:
7749    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
7750    return;
7751  case ISD::ATOMIC_LOAD_AND:
7752    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
7753    return;
7754  case ISD::ATOMIC_LOAD_NAND:
7755    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
7756    return;
7757  case ISD::ATOMIC_LOAD_OR:
7758    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
7759    return;
7760  case ISD::ATOMIC_LOAD_SUB:
7761    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
7762    return;
7763  case ISD::ATOMIC_LOAD_XOR:
7764    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
7765    return;
7766  case ISD::ATOMIC_SWAP:
7767    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
7768    return;
7769  }
7770}
7771
7772const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
7773  switch (Opcode) {
7774  default: return NULL;
7775  case X86ISD::BSF:                return "X86ISD::BSF";
7776  case X86ISD::BSR:                return "X86ISD::BSR";
7777  case X86ISD::SHLD:               return "X86ISD::SHLD";
7778  case X86ISD::SHRD:               return "X86ISD::SHRD";
7779  case X86ISD::FAND:               return "X86ISD::FAND";
7780  case X86ISD::FOR:                return "X86ISD::FOR";
7781  case X86ISD::FXOR:               return "X86ISD::FXOR";
7782  case X86ISD::FSRL:               return "X86ISD::FSRL";
7783  case X86ISD::FILD:               return "X86ISD::FILD";
7784  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
7785  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
7786  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
7787  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
7788  case X86ISD::FLD:                return "X86ISD::FLD";
7789  case X86ISD::FST:                return "X86ISD::FST";
7790  case X86ISD::CALL:               return "X86ISD::CALL";
7791  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
7792  case X86ISD::BT:                 return "X86ISD::BT";
7793  case X86ISD::CMP:                return "X86ISD::CMP";
7794  case X86ISD::COMI:               return "X86ISD::COMI";
7795  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
7796  case X86ISD::SETCC:              return "X86ISD::SETCC";
7797  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
7798  case X86ISD::CMOV:               return "X86ISD::CMOV";
7799  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
7800  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
7801  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
7802  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
7803  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
7804  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
7805  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
7806  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
7807  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
7808  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
7809  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
7810  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
7811  case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
7812  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
7813  case X86ISD::FMAX:               return "X86ISD::FMAX";
7814  case X86ISD::FMIN:               return "X86ISD::FMIN";
7815  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
7816  case X86ISD::FRCP:               return "X86ISD::FRCP";
7817  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
7818  case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
7819  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
7820  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
7821  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
7822  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
7823  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
7824  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
7825  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
7826  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
7827  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
7828  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
7829  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
7830  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
7831  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
7832  case X86ISD::VSHL:               return "X86ISD::VSHL";
7833  case X86ISD::VSRL:               return "X86ISD::VSRL";
7834  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
7835  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
7836  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
7837  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
7838  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
7839  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
7840  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
7841  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
7842  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
7843  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
7844  case X86ISD::ADD:                return "X86ISD::ADD";
7845  case X86ISD::SUB:                return "X86ISD::SUB";
7846  case X86ISD::SMUL:               return "X86ISD::SMUL";
7847  case X86ISD::UMUL:               return "X86ISD::UMUL";
7848  case X86ISD::INC:                return "X86ISD::INC";
7849  case X86ISD::DEC:                return "X86ISD::DEC";
7850  case X86ISD::OR:                 return "X86ISD::OR";
7851  case X86ISD::XOR:                return "X86ISD::XOR";
7852  case X86ISD::AND:                return "X86ISD::AND";
7853  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
7854  case X86ISD::PTEST:              return "X86ISD::PTEST";
7855  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
7856  case X86ISD::MINGW_ALLOCA:       return "X86ISD::MINGW_ALLOCA";
7857  }
7858}
7859
7860// isLegalAddressingMode - Return true if the addressing mode represented
7861// by AM is legal for this target, for a load/store of the specified type.
7862bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
7863                                              const Type *Ty) const {
7864  // X86 supports extremely general addressing modes.
7865  CodeModel::Model M = getTargetMachine().getCodeModel();
7866
7867  // X86 allows a sign-extended 32-bit immediate field as a displacement.
7868  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
7869    return false;
7870
7871  if (AM.BaseGV) {
7872    unsigned GVFlags =
7873      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
7874
7875    // If a reference to this global requires an extra load, we can't fold it.
7876    if (isGlobalStubReference(GVFlags))
7877      return false;
7878
7879    // If BaseGV requires a register for the PIC base, we cannot also have a
7880    // BaseReg specified.
7881    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
7882      return false;
7883
7884    // If lower 4G is not available, then we must use rip-relative addressing.
7885    if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
7886      return false;
7887  }
7888
7889  switch (AM.Scale) {
7890  case 0:
7891  case 1:
7892  case 2:
7893  case 4:
7894  case 8:
7895    // These scales always work.
7896    break;
7897  case 3:
7898  case 5:
7899  case 9:
7900    // These scales are formed with basereg+scalereg.  Only accept if there is
7901    // no basereg yet.
7902    if (AM.HasBaseReg)
7903      return false;
7904    break;
7905  default:  // Other stuff never works.
7906    return false;
7907  }
7908
7909  return true;
7910}
7911
7912
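// Truncating to a narrower integer type is free on x86: the narrow value is
// simply the low bits (a subregister) of the wide one, so no instruction is
// needed.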
7913bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
7914  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
7915    return false;
7916  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
7917  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
7918  if (NumBits1 <= NumBits2)
7919    return false;
7920  return true;
7921}
7922
7923bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
7924  if (!VT1.isInteger() || !VT2.isInteger())
7925    return false;
7926  unsigned NumBits1 = VT1.getSizeInBits();
7927  unsigned NumBits2 = VT2.getSizeInBits();
7928  if (NumBits1 <= NumBits2)
7929    return false;
7930  return true;
7931}
7932
7933bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
7934  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7935  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
7936}
7937
7938bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
7939  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7940  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
7941}
7942
7943bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
7944  // i16 instructions are longer (0x66 prefix) and potentially slower.
7945  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
7946}
7947
7948/// isShuffleMaskLegal - Targets can use this to indicate that they only
7949/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
7950/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
7951/// are assumed to be legal.
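/// For example, splat masks and the unpack/shuffle patterns recognized below
/// are reported as legal, while arbitrary byte shuffles are not (see the
/// pshufb FIXME).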
7952bool
7953X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
7954                                      EVT VT) const {
7955  // Very little shuffling can be done for 64-bit vectors right now.
7956  if (VT.getSizeInBits() == 64)
7957    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());
7958
7959  // FIXME: pshufb, blends, shifts.
7960  return (VT.getVectorNumElements() == 2 ||
7961          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7962          isMOVLMask(M, VT) ||
7963          isSHUFPMask(M, VT) ||
7964          isPSHUFDMask(M, VT) ||
7965          isPSHUFHWMask(M, VT) ||
7966          isPSHUFLWMask(M, VT) ||
7967          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
7968          isUNPCKLMask(M, VT) ||
7969          isUNPCKHMask(M, VT) ||
7970          isUNPCKL_v_undef_Mask(M, VT) ||
7971          isUNPCKH_v_undef_Mask(M, VT));
7972}
7973
7974bool
7975X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
7976                                          EVT VT) const {
7977  unsigned NumElts = VT.getVectorNumElements();
7978  // FIXME: This collection of masks seems suspect.
7979  if (NumElts == 2)
7980    return true;
7981  if (NumElts == 4 && VT.getSizeInBits() == 128) {
7982    return (isMOVLMask(Mask, VT)  ||
7983            isCommutedMOVLMask(Mask, VT, true) ||
7984            isSHUFPMask(Mask, VT) ||
7985            isCommutedSHUFPMask(Mask, VT));
7986  }
7987  return false;
7988}
7989
7990//===----------------------------------------------------------------------===//
7991//                           X86 Scheduler Hooks
7992//===----------------------------------------------------------------------===//
7993
7994// Private utility function: expands the atomic bitwise pseudo-instructions
// (ATOMAND/ATOMOR/ATOMXOR/ATOMNAND) into a load/op/cmpxchg loop.
7995MachineBasicBlock *
7996X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
7997                                                       MachineBasicBlock *MBB,
7998                                                       unsigned regOpc,
7999                                                       unsigned immOpc,
8000                                                       unsigned LoadOpc,
8001                                                       unsigned CXchgOpc,
8002                                                       unsigned copyOpc,
8003                                                       unsigned notOpc,
8004                                                       unsigned EAXreg,
8005                                                       TargetRegisterClass *RC,
8006                                                       bool invSrc) const {
8007  // For the atomic bitwise operator, we generate
8008  //   thisMBB:
8009  //   newMBB:
8010  //     ld  t1 = [bitinstr.addr]
8011  //     op  t2 = t1, [bitinstr.val]
8012  //     mov EAX = t1
8013  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
8014  //     bz  newMBB
8015  //     fallthrough -->nextMBB
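  // (For ATOMAND32, for instance, the caller passes AND32rr/AND32ri as the
  // reg/imm opcodes and LCMPXCHG32 as CXchgOpc, so the loop above becomes a
  // lock-cmpxchg based atomic and.)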
8016  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8017  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8018  MachineFunction::iterator MBBIter = MBB;
8019  ++MBBIter;
8020
8021  /// First build the CFG
8022  MachineFunction *F = MBB->getParent();
8023  MachineBasicBlock *thisMBB = MBB;
8024  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8025  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8026  F->insert(MBBIter, newMBB);
8027  F->insert(MBBIter, nextMBB);
8028
8029  // Move all successors of thisMBB to nextMBB
8030  nextMBB->transferSuccessors(thisMBB);
8031
8032  // Update thisMBB to fall through to newMBB
8033  thisMBB->addSuccessor(newMBB);
8034
8035  // newMBB jumps to itself and falls through to nextMBB
8036  newMBB->addSuccessor(nextMBB);
8037  newMBB->addSuccessor(newMBB);
8038
8039  // Insert instructions into newMBB based on incoming instruction
8040  assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
8041         "unexpected number of operands");
8042  DebugLoc dl = bInstr->getDebugLoc();
8043  MachineOperand& destOper = bInstr->getOperand(0);
8044  MachineOperand* argOpers[2 + X86AddrNumOperands];
8045  int numArgs = bInstr->getNumOperands() - 1;
8046  for (int i=0; i < numArgs; ++i)
8047    argOpers[i] = &bInstr->getOperand(i+1);
8048
8049  // x86 address has 5 operands: base, index, scale, displacement, and segment.
8050  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8051  int valArgIndx = lastAddrIndx + 1;
8052
8053  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8054  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
8055  for (int i=0; i <= lastAddrIndx; ++i)
8056    (*MIB).addOperand(*argOpers[i]);
8057
8058  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
8059  if (invSrc) {
8060    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
8061  }
8062  else
8063    tt = t1;
8064
8065  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8066  assert((argOpers[valArgIndx]->isReg() ||
8067          argOpers[valArgIndx]->isImm()) &&
8068         "invalid operand");
8069  if (argOpers[valArgIndx]->isReg())
8070    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
8071  else
8072    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
8073  MIB.addReg(tt);
8074  (*MIB).addOperand(*argOpers[valArgIndx]);
8075
8076  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
8077  MIB.addReg(t1);
8078
8079  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
8080  for (int i=0; i <= lastAddrIndx; ++i)
8081    (*MIB).addOperand(*argOpers[i]);
8082  MIB.addReg(t2);
8083  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8084  (*MIB).setMemRefs(bInstr->memoperands_begin(),
8085                    bInstr->memoperands_end());
8086
8087  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
8088  MIB.addReg(EAXreg);
8089
8090  // insert branch
8091  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8092
8093  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
8094  return nextMBB;
8095}
8096
8097// Private utility function: expands 64-bit atomics on a 32-bit host using
// cmpxchg8b.
8098MachineBasicBlock *
8099X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
8100                                                       MachineBasicBlock *MBB,
8101                                                       unsigned regOpcL,
8102                                                       unsigned regOpcH,
8103                                                       unsigned immOpcL,
8104                                                       unsigned immOpcH,
8105                                                       bool invSrc) const {
8106  // For the atomic bitwise operator, we generate
8107  //   thisMBB (instructions are in pairs, except cmpxchg8b)
8108  //     ld t1,t2 = [bitinstr.addr]
8109  //   newMBB:
8110  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
8111  //     op  t5, t6 <- out1, out2, [bitinstr.val]
8112  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
8113  //     mov ECX, EBX <- t5, t6
8114  //     mov EAX, EDX <- t1, t2
8115  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
8116  //     mov t3, t4 <- EAX, EDX
8117  //     bz  newMBB
8118  //     result in out1, out2
8119  //     fallthrough -->nextMBB
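  // (ATOMADD6432, for example, passes ADD32rr/ADC32rr as regOpcL/regOpcH so
  // that the high half of the 64-bit operation picks up the carry produced by
  // the low half.)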
8120
8121  const TargetRegisterClass *RC = X86::GR32RegisterClass;
8122  const unsigned LoadOpc = X86::MOV32rm;
8123  const unsigned copyOpc = X86::MOV32rr;
8124  const unsigned NotOpc = X86::NOT32r;
8125  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8126  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8127  MachineFunction::iterator MBBIter = MBB;
8128  ++MBBIter;
8129
8130  /// First build the CFG
8131  MachineFunction *F = MBB->getParent();
8132  MachineBasicBlock *thisMBB = MBB;
8133  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8134  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8135  F->insert(MBBIter, newMBB);
8136  F->insert(MBBIter, nextMBB);
8137
8138  // Move all successors of thisMBB to nextMBB
8139  nextMBB->transferSuccessors(thisMBB);
8140
8141  // Update thisMBB to fall through to newMBB
8142  thisMBB->addSuccessor(newMBB);
8143
8144  // newMBB jumps to itself and falls through to nextMBB
8145  newMBB->addSuccessor(nextMBB);
8146  newMBB->addSuccessor(newMBB);
8147
8148  DebugLoc dl = bInstr->getDebugLoc();
8149  // Insert instructions into newMBB based on incoming instruction
8150  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
8151  assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
8152         "unexpected number of operands");
8153  MachineOperand& dest1Oper = bInstr->getOperand(0);
8154  MachineOperand& dest2Oper = bInstr->getOperand(1);
8155  MachineOperand* argOpers[2 + X86AddrNumOperands];
8156  for (int i=0; i < 2 + X86AddrNumOperands; ++i)
8157    argOpers[i] = &bInstr->getOperand(i+2);
8158
8159  // x86 address has 5 operands: base, index, scale, displacement, and segment.
8160  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8161
8162  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8163  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
8164  for (int i=0; i <= lastAddrIndx; ++i)
8165    (*MIB).addOperand(*argOpers[i]);
8166  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8167  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
8168  // add 4 to displacement.
8169  for (int i=0; i <= lastAddrIndx-2; ++i)
8170    (*MIB).addOperand(*argOpers[i]);
8171  MachineOperand newOp3 = *(argOpers[3]);
8172  if (newOp3.isImm())
8173    newOp3.setImm(newOp3.getImm()+4);
8174  else
8175    newOp3.setOffset(newOp3.getOffset()+4);
8176  (*MIB).addOperand(newOp3);
8177  (*MIB).addOperand(*argOpers[lastAddrIndx]);
8178
8179  // t3/4 are defined later, at the bottom of the loop
8180  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
8181  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
8182  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
8183    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
8184  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
8185    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
8186
8187  // The subsequent operations should use the destination registers of
8188  // the PHI instructions.
8189  if (invSrc) {
8190    t1 = F->getRegInfo().createVirtualRegister(RC);
8191    t2 = F->getRegInfo().createVirtualRegister(RC);
8192    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
8193    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
8194  } else {
8195    t1 = dest1Oper.getReg();
8196    t2 = dest2Oper.getReg();
8197  }
8198
8199  int valArgIndx = lastAddrIndx + 1;
8200  assert((argOpers[valArgIndx]->isReg() ||
8201          argOpers[valArgIndx]->isImm()) &&
8202         "invalid operand");
8203  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
8204  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
8205  if (argOpers[valArgIndx]->isReg())
8206    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
8207  else
8208    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
8209  if (regOpcL != X86::MOV32rr)
8210    MIB.addReg(t1);
8211  (*MIB).addOperand(*argOpers[valArgIndx]);
8212  assert(argOpers[valArgIndx + 1]->isReg() ==
8213         argOpers[valArgIndx]->isReg());
8214  assert(argOpers[valArgIndx + 1]->isImm() ==
8215         argOpers[valArgIndx]->isImm());
8216  if (argOpers[valArgIndx + 1]->isReg())
8217    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
8218  else
8219    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
8220  if (regOpcH != X86::MOV32rr)
8221    MIB.addReg(t2);
8222  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
8223
8224  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
8225  MIB.addReg(t1);
8226  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
8227  MIB.addReg(t2);
8228
8229  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
8230  MIB.addReg(t5);
8231  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
8232  MIB.addReg(t6);
8233
8234  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
8235  for (int i=0; i <= lastAddrIndx; ++i)
8236    (*MIB).addOperand(*argOpers[i]);
8237
8238  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8239  (*MIB).setMemRefs(bInstr->memoperands_begin(),
8240                    bInstr->memoperands_end());
8241
8242  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
8243  MIB.addReg(X86::EAX);
8244  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
8245  MIB.addReg(X86::EDX);
8246
8247  // insert branch
8248  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8249
8250  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
8251  return nextMBB;
8252}
8253
8254// Private utility function: expands the atomic min/max pseudo-instructions
// into a load/compare/cmov/cmpxchg loop.
8255MachineBasicBlock *
8256X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
8257                                                      MachineBasicBlock *MBB,
8258                                                      unsigned cmovOpc) const {
8259  // For the atomic min/max operator, we generate
8260  //   thisMBB:
8261  //   newMBB:
8262  //     ld t1 = [min/max.addr]
8263  //     mov t2 = [min/max.val]
8264  //     cmp  t1, t2
8265  //     cmov[cond] t2 = t1
8266  //     mov EAX = t1
8267  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
8268  //     bz   newMBB
8269  //     fallthrough -->nextMBB
8270  //
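  // (For ATOMMIN32 the caller passes CMOVL32rr, so after the compare the cmov
  // selects the smaller of the loaded value and the operand, and that result
  // is what the cmpxchg attempts to store back.)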
8271  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8272  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8273  MachineFunction::iterator MBBIter = MBB;
8274  ++MBBIter;
8275
8276  /// First build the CFG
8277  MachineFunction *F = MBB->getParent();
8278  MachineBasicBlock *thisMBB = MBB;
8279  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8280  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8281  F->insert(MBBIter, newMBB);
8282  F->insert(MBBIter, nextMBB);
8283
8284  // Move all successors of thisMBB to nextMBB
8285  nextMBB->transferSuccessors(thisMBB);
8286
8287  // Update thisMBB to fall through to newMBB
8288  thisMBB->addSuccessor(newMBB);
8289
8290  // newMBB jumps to itself and falls through to nextMBB
8291  newMBB->addSuccessor(nextMBB);
8292  newMBB->addSuccessor(newMBB);
8293
8294  DebugLoc dl = mInstr->getDebugLoc();
8295  // Insert instructions into newMBB based on incoming instruction
8296  assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
8297         "unexpected number of operands");
8298  MachineOperand& destOper = mInstr->getOperand(0);
8299  MachineOperand* argOpers[2 + X86AddrNumOperands];
8300  int numArgs = mInstr->getNumOperands() - 1;
8301  for (int i=0; i < numArgs; ++i)
8302    argOpers[i] = &mInstr->getOperand(i+1);
8303
8304  // x86 address has 5 operands: base, index, scale, displacement, and segment.
8305  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8306  int valArgIndx = lastAddrIndx + 1;
8307
8308  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8309  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
8310  for (int i=0; i <= lastAddrIndx; ++i)
8311    (*MIB).addOperand(*argOpers[i]);
8312
8313  // We only support register and immediate values
8314  assert((argOpers[valArgIndx]->isReg() ||
8315          argOpers[valArgIndx]->isImm()) &&
8316         "invalid operand");
8317
8318  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8319  if (argOpers[valArgIndx]->isReg())
8320    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
8321  else
8322    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2); // immediate form
8323  (*MIB).addOperand(*argOpers[valArgIndx]);
8324
8325  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
8326  MIB.addReg(t1);
8327
8328  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
8329  MIB.addReg(t1);
8330  MIB.addReg(t2);
8331
8332  // Generate movc
8333  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8334  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
8335  MIB.addReg(t2);
8336  MIB.addReg(t1);
8337
8338  // Cmp and exchange if none has modified the memory location
8339  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
8340  for (int i=0; i <= lastAddrIndx; ++i)
8341    (*MIB).addOperand(*argOpers[i]);
8342  MIB.addReg(t3);
8343  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8344  (*MIB).setMemRefs(mInstr->memoperands_begin(),
8345                    mInstr->memoperands_end());
8346
8347  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
8348  MIB.addReg(X86::EAX);
8349
8350  // insert branch
8351  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8352
8353  F->DeleteMachineInstr(mInstr);   // The pseudo instruction is gone now.
8354  return nextMBB;
8355}
8356
8357// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
8358// all of this code can be replaced with that in the .td file.
8359MachineBasicBlock *
8360X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
8361                            unsigned numArgs, bool memArg) const {
8362
8363  MachineFunction *F = BB->getParent();
8364  DebugLoc dl = MI->getDebugLoc();
8365  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8366
8367  unsigned Opc;
8368  if (memArg)
8369    Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
8370  else
8371    Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
8372
8373  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
8374
8375  for (unsigned i = 0; i < numArgs; ++i) {
8376    MachineOperand &Op = MI->getOperand(i+1);
8377
8378    if (!(Op.isReg() && Op.isImplicit()))
8379      MIB.addOperand(Op);
8380  }
8381
8382  BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
8383    .addReg(X86::XMM0);
8384
8385  F->DeleteMachineInstr(MI);
8386
8387  return BB;
8388}
8389
8390MachineBasicBlock *
8391X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
8392                                                 MachineInstr *MI,
8393                                                 MachineBasicBlock *MBB) const {
8394  // Emit code to save XMM registers to the stack. The ABI says that the
8395  // number of registers to save is given in %al, so it's theoretically
8396  // possible to do an indirect jump trick to avoid saving all of them,
8397  // however this code takes a simpler approach and just executes all
8398  // of the stores if %al is non-zero. It's less code, and it's probably
8399  // easier on the hardware branch predictor, and stores aren't all that
8400  // expensive anyway.
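  //
  // The emitted skeleton is therefore roughly (the test/branch is skipped on
  // Win64):
  //   test %al, %al
  //   je   EndMBB
  // XMMSaveMBB:
  //   movaps %xmmN, offset_N(RegSaveFrameIndex)   ; one store per register
  // EndMBB: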
8401
8402  // Create the new basic blocks. One block contains all the XMM stores,
8403  // and one block is the final destination regardless of whether any
8404  // stores were performed.
8405  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8406  MachineFunction *F = MBB->getParent();
8407  MachineFunction::iterator MBBIter = MBB;
8408  ++MBBIter;
8409  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
8410  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
8411  F->insert(MBBIter, XMMSaveMBB);
8412  F->insert(MBBIter, EndMBB);
8413
8414  // Set up the CFG.
8415  // Move any original successors of MBB to the end block.
8416  EndMBB->transferSuccessors(MBB);
8417  // The original block will now fall through to the XMM save block.
8418  MBB->addSuccessor(XMMSaveMBB);
8419  // The XMMSaveMBB will fall through to the end block.
8420  XMMSaveMBB->addSuccessor(EndMBB);
8421
8422  // Now add the instructions.
8423  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8424  DebugLoc DL = MI->getDebugLoc();
8425
8426  unsigned CountReg = MI->getOperand(0).getReg();
8427  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
8428  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
8429
8430  if (!Subtarget->isTargetWin64()) {
8431    // If %al is 0, branch around the XMM save block.
8432    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
8433    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
8434    MBB->addSuccessor(EndMBB);
8435  }
8436
8437  // In the XMM save block, save all the XMM argument registers.
8438  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
8439    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
8440    MachineMemOperand *MMO =
8441      F->getMachineMemOperand(
8442        PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
8443        MachineMemOperand::MOStore, Offset,
8444        /*Size=*/16, /*Align=*/16);
8445    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
8446      .addFrameIndex(RegSaveFrameIndex)
8447      .addImm(/*Scale=*/1)
8448      .addReg(/*IndexReg=*/0)
8449      .addImm(/*Disp=*/Offset)
8450      .addReg(/*Segment=*/0)
8451      .addReg(MI->getOperand(i).getReg())
8452      .addMemOperand(MMO);
8453  }
8454
8455  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8456
8457  return EndMBB;
8458}
8459
8460MachineBasicBlock *
8461X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
8462                                     MachineBasicBlock *BB,
8463                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
8464  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8465  DebugLoc DL = MI->getDebugLoc();
8466
8467  // To "insert" a SELECT_CC instruction, we actually have to insert the
8468  // diamond control-flow pattern.  The incoming instruction knows the
8469  // destination vreg to set, the condition code register to branch on, the
8470  // true/false values to select between, and a branch opcode to use.
8471  const BasicBlock *LLVM_BB = BB->getBasicBlock();
8472  MachineFunction::iterator It = BB;
8473  ++It;
8474
8475  //  thisMBB:
8476  //  ...
8477  //   TrueVal = ...
8478  //   cmpTY ccX, r1, r2
8479  //   bCC copy1MBB
8480  //   fallthrough --> copy0MBB
8481  MachineBasicBlock *thisMBB = BB;
8482  MachineFunction *F = BB->getParent();
8483  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
8484  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
8485  unsigned Opc =
8486    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
8487  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
8488  F->insert(It, copy0MBB);
8489  F->insert(It, sinkMBB);
8490  // Update machine-CFG edges by first adding all successors of the current
8491  // block to the new block which will contain the Phi node for the select.
8492  // Also inform sdisel of the edge changes.
8493  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
8494         E = BB->succ_end(); I != E; ++I) {
8495    EM->insert(std::make_pair(*I, sinkMBB));
8496    sinkMBB->addSuccessor(*I);
8497  }
8498  // Next, remove all successors of the current block, and add the true
8499  // and fallthrough blocks as its successors.
8500  while (!BB->succ_empty())
8501    BB->removeSuccessor(BB->succ_begin());
8502  // Add the true and fallthrough blocks as its successors.
8503  BB->addSuccessor(copy0MBB);
8504  BB->addSuccessor(sinkMBB);
8505
8506  //  copy0MBB:
8507  //   %FalseValue = ...
8508  //   # fallthrough to sinkMBB
8509  BB = copy0MBB;
8510
8511  // Update machine-CFG edges
8512  BB->addSuccessor(sinkMBB);
8513
8514  //  sinkMBB:
8515  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
8516  //  ...
8517  BB = sinkMBB;
8518  BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg())
8519    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
8520    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
8521
8522  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8523  return BB;
8524}
8525
8526MachineBasicBlock *
8527X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI,
8528                                          MachineBasicBlock *BB,
8529                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
8530  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8531  DebugLoc DL = MI->getDebugLoc();
8532  MachineFunction *F = BB->getParent();
8533
8534  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
8535  // non-trivial part is impdef of ESP.
8536  // FIXME: The code should be tweaked as soon as we'll try to do codegen for
8537  // mingw-w64.
8538
8539  BuildMI(BB, DL, TII->get(X86::CALLpcrel32))
8540    .addExternalSymbol("_alloca")
8541    .addReg(X86::EAX, RegState::Implicit)
8542    .addReg(X86::ESP, RegState::Implicit)
8543    .addReg(X86::EAX, RegState::Define | RegState::Implicit)
8544    .addReg(X86::ESP, RegState::Define | RegState::Implicit);
8545
8546  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8547  return BB;
8548}
8549
8550MachineBasicBlock *
8551X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
8552                                               MachineBasicBlock *BB,
8553                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
8554  switch (MI->getOpcode()) {
8555  default: assert(false && "Unexpected instr type to insert");
8556  case X86::MINGW_ALLOCA:
8557    return EmitLoweredMingwAlloca(MI, BB, EM);
8558  case X86::CMOV_GR8:
8559  case X86::CMOV_V1I64:
8560  case X86::CMOV_FR32:
8561  case X86::CMOV_FR64:
8562  case X86::CMOV_V4F32:
8563  case X86::CMOV_V2F64:
8564  case X86::CMOV_V2I64:
8565  case X86::CMOV_GR16:
8566  case X86::CMOV_GR32:
8567  case X86::CMOV_RFP32:
8568  case X86::CMOV_RFP64:
8569  case X86::CMOV_RFP80:
8570    return EmitLoweredSelect(MI, BB, EM);
8571
8572  case X86::FP32_TO_INT16_IN_MEM:
8573  case X86::FP32_TO_INT32_IN_MEM:
8574  case X86::FP32_TO_INT64_IN_MEM:
8575  case X86::FP64_TO_INT16_IN_MEM:
8576  case X86::FP64_TO_INT32_IN_MEM:
8577  case X86::FP64_TO_INT64_IN_MEM:
8578  case X86::FP80_TO_INT16_IN_MEM:
8579  case X86::FP80_TO_INT32_IN_MEM:
8580  case X86::FP80_TO_INT64_IN_MEM: {
8581    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8582    DebugLoc DL = MI->getDebugLoc();
8583
8584    // Change the floating point control register to use "round towards zero"
8585    // mode when truncating to an integer value.
8586    MachineFunction *F = BB->getParent();
8587    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
8588    addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);
8589
8590    // Load the old value of the control word...
8591    unsigned OldCW =
8592      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
8593    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
8594                      CWFrameIdx);
8595
8596    // Store a control word with the rounding mode set to round-towards-zero...
8597    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
8598      .addImm(0xC7F);
8599
8600    // Reload the modified control word now...
8601    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
8602
8603    // Restore the memory image of control word to original value
8604    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
8605      .addReg(OldCW);
8606
8607    // Get the X86 opcode to use.
8608    unsigned Opc;
8609    switch (MI->getOpcode()) {
8610    default: llvm_unreachable("illegal opcode!");
8611    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
8612    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
8613    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
8614    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
8615    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
8616    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
8617    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
8618    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
8619    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
8620    }
8621
8622    X86AddressMode AM;
8623    MachineOperand &Op = MI->getOperand(0);
8624    if (Op.isReg()) {
8625      AM.BaseType = X86AddressMode::RegBase;
8626      AM.Base.Reg = Op.getReg();
8627    } else {
8628      AM.BaseType = X86AddressMode::FrameIndexBase;
8629      AM.Base.FrameIndex = Op.getIndex();
8630    }
8631    Op = MI->getOperand(1);
8632    if (Op.isImm())
8633      AM.Scale = Op.getImm();
8634    Op = MI->getOperand(2);
8635    if (Op.isImm())
8636      AM.IndexReg = Op.getImm();
8637    Op = MI->getOperand(3);
8638    if (Op.isGlobal()) {
8639      AM.GV = Op.getGlobal();
8640    } else {
8641      AM.Disp = Op.getImm();
8642    }
8643    addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM)
8644                      .addReg(MI->getOperand(X86AddrNumOperands).getReg());
8645
8646    // Reload the original control word now.
8647    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
8648
8649    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8650    return BB;
8651  }
8652    // DBG_VALUE.  Only the frame index case is done here.
8653  case X86::DBG_VALUE: {
8654    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8655    DebugLoc DL = MI->getDebugLoc();
8656    X86AddressMode AM;
8657    MachineFunction *F = BB->getParent();
8658    AM.BaseType = X86AddressMode::FrameIndexBase;
8659    AM.Base.FrameIndex = MI->getOperand(0).getImm();
8660    addFullAddress(BuildMI(BB, DL, TII->get(X86::DBG_VALUE)), AM).
8661      addImm(MI->getOperand(1).getImm()).
8662      addMetadata(MI->getOperand(2).getMetadata());
8663    F->DeleteMachineInstr(MI);      // Remove pseudo.
8664    return BB;
8665  }
8666
8667    // String/text processing lowering.
8668  case X86::PCMPISTRM128REG:
8669    return EmitPCMP(MI, BB, 3, false /* in-mem */);
8670  case X86::PCMPISTRM128MEM:
8671    return EmitPCMP(MI, BB, 3, true /* in-mem */);
8672  case X86::PCMPESTRM128REG:
8673    return EmitPCMP(MI, BB, 5, false /* in mem */);
8674  case X86::PCMPESTRM128MEM:
8675    return EmitPCMP(MI, BB, 5, true /* in mem */);
8676
8677    // Atomic Lowering.
8678  case X86::ATOMAND32:
8679    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
8680                                               X86::AND32ri, X86::MOV32rm,
8681                                               X86::LCMPXCHG32, X86::MOV32rr,
8682                                               X86::NOT32r, X86::EAX,
8683                                               X86::GR32RegisterClass);
8684  case X86::ATOMOR32:
8685    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
8686                                               X86::OR32ri, X86::MOV32rm,
8687                                               X86::LCMPXCHG32, X86::MOV32rr,
8688                                               X86::NOT32r, X86::EAX,
8689                                               X86::GR32RegisterClass);
8690  case X86::ATOMXOR32:
8691    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
8692                                               X86::XOR32ri, X86::MOV32rm,
8693                                               X86::LCMPXCHG32, X86::MOV32rr,
8694                                               X86::NOT32r, X86::EAX,
8695                                               X86::GR32RegisterClass);
8696  case X86::ATOMNAND32:
8697    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
8698                                               X86::AND32ri, X86::MOV32rm,
8699                                               X86::LCMPXCHG32, X86::MOV32rr,
8700                                               X86::NOT32r, X86::EAX,
8701                                               X86::GR32RegisterClass, true);
8702  case X86::ATOMMIN32:
8703    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
8704  case X86::ATOMMAX32:
8705    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
8706  case X86::ATOMUMIN32:
8707    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
8708  case X86::ATOMUMAX32:
8709    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
8710
8711  case X86::ATOMAND16:
8712    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
8713                                               X86::AND16ri, X86::MOV16rm,
8714                                               X86::LCMPXCHG16, X86::MOV16rr,
8715                                               X86::NOT16r, X86::AX,
8716                                               X86::GR16RegisterClass);
8717  case X86::ATOMOR16:
8718    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
8719                                               X86::OR16ri, X86::MOV16rm,
8720                                               X86::LCMPXCHG16, X86::MOV16rr,
8721                                               X86::NOT16r, X86::AX,
8722                                               X86::GR16RegisterClass);
8723  case X86::ATOMXOR16:
8724    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
8725                                               X86::XOR16ri, X86::MOV16rm,
8726                                               X86::LCMPXCHG16, X86::MOV16rr,
8727                                               X86::NOT16r, X86::AX,
8728                                               X86::GR16RegisterClass);
8729  case X86::ATOMNAND16:
8730    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
8731                                               X86::AND16ri, X86::MOV16rm,
8732                                               X86::LCMPXCHG16, X86::MOV16rr,
8733                                               X86::NOT16r, X86::AX,
8734                                               X86::GR16RegisterClass, true);
8735  case X86::ATOMMIN16:
8736    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
8737  case X86::ATOMMAX16:
8738    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
8739  case X86::ATOMUMIN16:
8740    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
8741  case X86::ATOMUMAX16:
8742    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
8743
8744  case X86::ATOMAND8:
8745    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
8746                                               X86::AND8ri, X86::MOV8rm,
8747                                               X86::LCMPXCHG8, X86::MOV8rr,
8748                                               X86::NOT8r, X86::AL,
8749                                               X86::GR8RegisterClass);
8750  case X86::ATOMOR8:
8751    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
8752                                               X86::OR8ri, X86::MOV8rm,
8753                                               X86::LCMPXCHG8, X86::MOV8rr,
8754                                               X86::NOT8r, X86::AL,
8755                                               X86::GR8RegisterClass);
8756  case X86::ATOMXOR8:
8757    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
8758                                               X86::XOR8ri, X86::MOV8rm,
8759                                               X86::LCMPXCHG8, X86::MOV8rr,
8760                                               X86::NOT8r, X86::AL,
8761                                               X86::GR8RegisterClass);
8762  case X86::ATOMNAND8:
8763    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
8764                                               X86::AND8ri, X86::MOV8rm,
8765                                               X86::LCMPXCHG8, X86::MOV8rr,
8766                                               X86::NOT8r, X86::AL,
8767                                               X86::GR8RegisterClass, true);
8768  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
8769  // This group is for a 64-bit host.
8770  case X86::ATOMAND64:
8771    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
8772                                               X86::AND64ri32, X86::MOV64rm,
8773                                               X86::LCMPXCHG64, X86::MOV64rr,
8774                                               X86::NOT64r, X86::RAX,
8775                                               X86::GR64RegisterClass);
8776  case X86::ATOMOR64:
8777    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
8778                                               X86::OR64ri32, X86::MOV64rm,
8779                                               X86::LCMPXCHG64, X86::MOV64rr,
8780                                               X86::NOT64r, X86::RAX,
8781                                               X86::GR64RegisterClass);
8782  case X86::ATOMXOR64:
8783    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
8784                                               X86::XOR64ri32, X86::MOV64rm,
8785                                               X86::LCMPXCHG64, X86::MOV64rr,
8786                                               X86::NOT64r, X86::RAX,
8787                                               X86::GR64RegisterClass);
8788  case X86::ATOMNAND64:
8789    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
8790                                               X86::AND64ri32, X86::MOV64rm,
8791                                               X86::LCMPXCHG64, X86::MOV64rr,
8792                                               X86::NOT64r, X86::RAX,
8793                                               X86::GR64RegisterClass, true);
8794  case X86::ATOMMIN64:
8795    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
8796  case X86::ATOMMAX64:
8797    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
8798  case X86::ATOMUMIN64:
8799    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
8800  case X86::ATOMUMAX64:
8801    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
8802
8803  // This group does 64-bit operations on a 32-bit host.
8804  case X86::ATOMAND6432:
8805    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8806                                               X86::AND32rr, X86::AND32rr,
8807                                               X86::AND32ri, X86::AND32ri,
8808                                               false);
8809  case X86::ATOMOR6432:
8810    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8811                                               X86::OR32rr, X86::OR32rr,
8812                                               X86::OR32ri, X86::OR32ri,
8813                                               false);
8814  case X86::ATOMXOR6432:
8815    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8816                                               X86::XOR32rr, X86::XOR32rr,
8817                                               X86::XOR32ri, X86::XOR32ri,
8818                                               false);
8819  case X86::ATOMNAND6432:
8820    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8821                                               X86::AND32rr, X86::AND32rr,
8822                                               X86::AND32ri, X86::AND32ri,
8823                                               true);
8824  case X86::ATOMADD6432:
8825    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8826                                               X86::ADD32rr, X86::ADC32rr,
8827                                               X86::ADD32ri, X86::ADC32ri,
8828                                               false);
8829  case X86::ATOMSUB6432:
8830    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8831                                               X86::SUB32rr, X86::SBB32rr,
8832                                               X86::SUB32ri, X86::SBB32ri,
8833                                               false);
8834  case X86::ATOMSWAP6432:
8835    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8836                                               X86::MOV32rr, X86::MOV32rr,
8837                                               X86::MOV32ri, X86::MOV32ri,
8838                                               false);
8839  case X86::VASTART_SAVE_XMM_REGS:
8840    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
8841  }
8842}
8843
8844//===----------------------------------------------------------------------===//
8845//                           X86 Optimization Hooks
8846//===----------------------------------------------------------------------===//
8847
8848void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
8849                                                       const APInt &Mask,
8850                                                       APInt &KnownZero,
8851                                                       APInt &KnownOne,
8852                                                       const SelectionDAG &DAG,
8853                                                       unsigned Depth) const {
8854  unsigned Opc = Op.getOpcode();
8855  assert((Opc >= ISD::BUILTIN_OP_END ||
8856          Opc == ISD::INTRINSIC_WO_CHAIN ||
8857          Opc == ISD::INTRINSIC_W_CHAIN ||
8858          Opc == ISD::INTRINSIC_VOID) &&
8859         "Should use MaskedValueIsZero if you don't know whether Op"
8860         " is a target node!");
8861
8862  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
8863  switch (Opc) {
8864  default: break;
8865  case X86ISD::ADD:
8866  case X86ISD::SUB:
8867  case X86ISD::SMUL:
8868  case X86ISD::UMUL:
8869  case X86ISD::INC:
8870  case X86ISD::DEC:
8871  case X86ISD::OR:
8872  case X86ISD::XOR:
8873  case X86ISD::AND:
8874    // These nodes' second result is a boolean.
8875    if (Op.getResNo() == 0)
8876      break;
8877    // Fallthrough
8878  case X86ISD::SETCC:
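    // A SETCC (or the boolean result that falls through from above) is always
    // 0 or 1, so every bit above the lowest one is known to be zero.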
8879    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
8880                                       Mask.getBitWidth() - 1);
8881    break;
8882  }
8883}
8884
8885/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
8886/// node is a GlobalAddress + offset.
8887bool X86TargetLowering::isGAPlusOffset(SDNode *N,
8888                                       const GlobalValue* &GA,
8889                                       int64_t &Offset) const {
8890  if (N->getOpcode() == X86ISD::Wrapper) {
8891    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
8892      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
8893      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
8894      return true;
8895    }
8896  }
8897  return TargetLowering::isGAPlusOffset(N, GA, Offset);
8898}
8899
8900/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
8901/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
8902/// if the load addresses are consecutive, non-overlapping, and in the right
8903/// order.
8904static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
8905                                     const TargetLowering &TLI) {
8906  DebugLoc dl = N->getDebugLoc();
8907  EVT VT = N->getValueType(0);
8908  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
8909
8910  if (VT.getSizeInBits() != 128)
8911    return SDValue();
8912
8913  SmallVector<SDValue, 16> Elts;
8914  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
8915    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));
8916
8917  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
8918}
8919
8920/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index generation
8921/// and convert it from being a bunch of shuffles and extracts to a simple
8922/// store and scalar loads to extract the elements.
8923static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
8924                                                const TargetLowering &TLI) {
8925  SDValue InputVector = N->getOperand(0);
8926
8927  // Only operate on vectors of 4 elements, where the alternative shuffling
8928  // gets to be more expensive.
8929  if (InputVector.getValueType() != MVT::v4i32)
8930    return SDValue();
8931
8932  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
8933  // single use which is a sign-extend or zero-extend, and all elements are
8934  // used.
8935  SmallVector<SDNode *, 4> Uses;
8936  unsigned ExtractedElements = 0;
8937  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
8938       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
8939    if (UI.getUse().getResNo() != InputVector.getResNo())
8940      return SDValue();
8941
8942    SDNode *Extract = *UI;
8943    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8944      return SDValue();
8945
8946    if (Extract->getValueType(0) != MVT::i32)
8947      return SDValue();
8948    if (!Extract->hasOneUse())
8949      return SDValue();
8950    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
8951        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
8952      return SDValue();
8953    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
8954      return SDValue();
8955
8956    // Record which element was extracted.
8957    ExtractedElements |=
8958      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
8959
8960    Uses.push_back(Extract);
8961  }
8962
8963  // If not all the elements were used, this may not be worthwhile.
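  // (All four lanes of the v4i32 must have been extracted, i.e. the mask of
  // seen elements is 0b1111.)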
8964  if (ExtractedElements != 15)
8965    return SDValue();
8966
8967  // Ok, we've now decided to do the transformation.
8968  DebugLoc dl = InputVector.getDebugLoc();
8969
8970  // Store the value to a temporary stack slot.
8971  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
8972  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 0,
8973                            false, false, 0);
8974
8975  // Replace each use (extract) with a load of the appropriate element.
8976  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
8977       UE = Uses.end(); UI != UE; ++UI) {
8978    SDNode *Extract = *UI;
8979
8980    // Compute the element's address.
8981    SDValue Idx = Extract->getOperand(1);
8982    unsigned EltSize =
8983        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
8984    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
8985    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
8986
8987    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), OffsetVal, StackPtr);
8988
8989    // Load the scalar.
8990    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr,
8991                          NULL, 0, false, false, 0);
8992
8993    // Replace the extract with the load.
8994    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
8995  }
8996
8997  // The replacement was made in place; don't return anything.
8998  return SDValue();
8999}
9000
9001/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
9002static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
9003                                    const X86Subtarget *Subtarget) {
9004  DebugLoc DL = N->getDebugLoc();
9005  SDValue Cond = N->getOperand(0);
9006  // Get the LHS/RHS of the select.
9007  SDValue LHS = N->getOperand(1);
9008  SDValue RHS = N->getOperand(2);
9009
9010  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
9011  // instructions match the semantics of the common C idiom x<y?x:y but not
9012  // x<=y?x:y, because of how they handle negative zero (which can be
9013  // ignored in unsafe-math mode).
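  // For example, "a < b ? a : b" on floats becomes X86ISD::FMIN here and is
  // then selected to minss/minsd.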
9014  if (Subtarget->hasSSE2() &&
9015      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
9016      Cond.getOpcode() == ISD::SETCC) {
9017    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
9018
9019    unsigned Opcode = 0;
9020    // Check for x CC y ? x : y.
9021    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
9022        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
9023      switch (CC) {
9024      default: break;
9025      case ISD::SETULT:
9026        // Converting this to a min would handle NaNs incorrectly, and swapping
9027        // the operands would cause it to handle comparisons between positive
9028        // and negative zero incorrectly.
9029        if (!FiniteOnlyFPMath() &&
9030            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
9031          if (!UnsafeFPMath &&
9032              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9033            break;
9034          std::swap(LHS, RHS);
9035        }
9036        Opcode = X86ISD::FMIN;
9037        break;
9038      case ISD::SETOLE:
9039        // Converting this to a min would handle comparisons between positive
9040        // and negative zero incorrectly.
9041        if (!UnsafeFPMath &&
9042            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9043          break;
9044        Opcode = X86ISD::FMIN;
9045        break;
9046      case ISD::SETULE:
9047        // Converting this to a min would handle both negative zeros and NaNs
9048        // incorrectly, but we can swap the operands to fix both.
9049        std::swap(LHS, RHS);
9050      case ISD::SETOLT:
9051      case ISD::SETLT:
9052      case ISD::SETLE:
9053        Opcode = X86ISD::FMIN;
9054        break;
9055
9056      case ISD::SETOGE:
9057        // Converting this to a max would handle comparisons between positive
9058        // and negative zero incorrectly.
9059        if (!UnsafeFPMath &&
9060            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9061          break;
9062        Opcode = X86ISD::FMAX;
9063        break;
9064      case ISD::SETUGT:
9065        // Converting this to a max would handle NaNs incorrectly, and swapping
9066        // the operands would cause it to handle comparisons between positive
9067        // and negative zero incorrectly.
9068        if (!FiniteOnlyFPMath() &&
9069            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
9070          if (!UnsafeFPMath &&
9071              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9072            break;
9073          std::swap(LHS, RHS);
9074        }
9075        Opcode = X86ISD::FMAX;
9076        break;
9077      case ISD::SETUGE:
9078        // Converting this to a max would handle both negative zeros and NaNs
9079        // incorrectly, but we can swap the operands to fix both.
9080        std::swap(LHS, RHS);
9081      case ISD::SETOGT:
9082      case ISD::SETGT:
9083      case ISD::SETGE:
9084        Opcode = X86ISD::FMAX;
9085        break;
9086      }
9087    // Check for x CC y ? y : x -- a min/max with reversed arms.
9088    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
9089               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
9090      switch (CC) {
9091      default: break;
9092      case ISD::SETOGE:
9093        // Converting this to a min would handle comparisons between positive
9094        // and negative zero incorrectly, and swapping the operands would
9095        // cause it to handle NaNs incorrectly.
9096        if (!UnsafeFPMath &&
9097            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
9098          if (!FiniteOnlyFPMath() &&
9099              (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9100            break;
9101          std::swap(LHS, RHS);
9102        }
9103        Opcode = X86ISD::FMIN;
9104        break;
9105      case ISD::SETUGT:
9106        // Converting this to a min would handle NaNs incorrectly.
9107        if (!UnsafeFPMath &&
9108            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9109          break;
9110        Opcode = X86ISD::FMIN;
9111        break;
9112      case ISD::SETUGE:
9113        // Converting this to a min would handle both negative zeros and NaNs
9114        // incorrectly, but we can swap the operands to fix both.
9115        std::swap(LHS, RHS);
9116      case ISD::SETOGT:
9117      case ISD::SETGT:
9118      case ISD::SETGE:
9119        Opcode = X86ISD::FMIN;
9120        break;
9121
9122      case ISD::SETULT:
9123        // Converting this to a max would handle NaNs incorrectly.
9124        if (!FiniteOnlyFPMath() &&
9125            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9126          break;
9127        Opcode = X86ISD::FMAX;
9128        break;
9129      case ISD::SETOLE:
9130        // Converting this to a max would handle comparisons between positive
9131        // and negative zero incorrectly, and swapping the operands would
9132        // cause it to handle NaNs incorrectly.
9133        if (!UnsafeFPMath &&
9134            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
9135          if (!FiniteOnlyFPMath() &&
9136              (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9137            break;
9138          std::swap(LHS, RHS);
9139        }
9140        Opcode = X86ISD::FMAX;
9141        break;
9142      case ISD::SETULE:
9143        // Converting this to a max would handle both negative zeros and NaNs
9144        // incorrectly, but we can swap the operands to fix both.
9145        std::swap(LHS, RHS);
9146      case ISD::SETOLT:
9147      case ISD::SETLT:
9148      case ISD::SETLE:
9149        Opcode = X86ISD::FMAX;
9150        break;
9151      }
9152    }
9153
9154    if (Opcode)
9155      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
9156  }
9157
9158  // If this is a select between two integer constants, try to do some
9159  // optimizations.
9160  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
9161    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
9162      // Don't do this for crazy integer types.
9163      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
9164        // If this is efficiently invertible, canonicalize the TrueC/FalseC values
9165        // so that TrueC (the true value) is larger than FalseC.
9166        bool NeedsCondInvert = false;
9167
9168        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
9169            // Efficiently invertible.
9170            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
9171             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
9172              isa<ConstantSDNode>(Cond.getOperand(1))))) {
9173          NeedsCondInvert = true;
9174          std::swap(TrueC, FalseC);
9175        }
9176
9177        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
9178        if (FalseC->getAPIntValue() == 0 &&
9179            TrueC->getAPIntValue().isPowerOf2()) {
9180          if (NeedsCondInvert) // Invert the condition if needed.
9181            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9182                               DAG.getConstant(1, Cond.getValueType()));
9183
9184          // Zero extend the condition if needed.
9185          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
9186
9187          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9188          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
9189                             DAG.getConstant(ShAmt, MVT::i8));
9190        }
9191
9192        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
9193        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9194          if (NeedsCondInvert) // Invert the condition if needed.
9195            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9196                               DAG.getConstant(1, Cond.getValueType()));
9197
9198          // Zero extend the condition if needed.
9199          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9200                             FalseC->getValueType(0), Cond);
9201          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9202                             SDValue(FalseC, 0));
9203        }
9204
9205        // Optimize cases that will turn into an LEA instruction.  This requires
9206        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
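        // For example, (Cond ? 13 : 4) has Diff = 9: the code below emits
        // zext(Cond), scales it by 9, and adds the base 4, which isel can
        // then fold into a single LEA.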
9207        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9208          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9209          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9210
9211          bool isFastMultiplier = false;
9212          if (Diff < 10) {
9213            switch ((unsigned char)Diff) {
9214              default: break;
9215              case 1:  // result = add base, cond
9216              case 2:  // result = lea base(    , cond*2)
9217              case 3:  // result = lea base(cond, cond*2)
9218              case 4:  // result = lea base(    , cond*4)
9219              case 5:  // result = lea base(cond, cond*4)
9220              case 8:  // result = lea base(    , cond*8)
9221              case 9:  // result = lea base(cond, cond*8)
9222                isFastMultiplier = true;
9223                break;
9224            }
9225          }
9226
9227          if (isFastMultiplier) {
9228            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9229            if (NeedsCondInvert) // Invert the condition if needed.
9230              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9231                                 DAG.getConstant(1, Cond.getValueType()));
9232
9233            // Zero extend the condition if needed.
9234            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9235                               Cond);
9236            // Scale the condition by the difference.
9237            if (Diff != 1)
9238              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9239                                 DAG.getConstant(Diff, Cond.getValueType()));
9240
9241            // Add the base if non-zero.
9242            if (FalseC->getAPIntValue() != 0)
9243              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9244                                 SDValue(FalseC, 0));
9245            return Cond;
9246          }
9247        }
9248      }
9249  }
9250
9251  return SDValue();
9252}
9253
9254/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
9255static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
9256                                  TargetLowering::DAGCombinerInfo &DCI) {
9257  DebugLoc DL = N->getDebugLoc();
9258
9259  // If the flag operand isn't dead, don't touch this CMOV.
9260  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
9261    return SDValue();
9262
9263  // If this is a select between two integer constants, try to do some
9264  // optimizations.  Note that the operands are ordered the opposite of SELECT
9265  // operands.
9266  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
9267    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
9268      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
9269      // larger than FalseC (the false value).
9270      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
9271
9272      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
9273        CC = X86::GetOppositeBranchCondition(CC);
9274        std::swap(TrueC, FalseC);
9275      }
9276
9277      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
9278      // This is efficient for any integer data type (including i8/i16) and
9279      // shift amount.
9280      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
9281        SDValue Cond = N->getOperand(3);
9282        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9283                           DAG.getConstant(CC, MVT::i8), Cond);
9284
9285        // Zero extend the condition if needed.
9286        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
9287
9288        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9289        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
9290                           DAG.getConstant(ShAmt, MVT::i8));
9291        if (N->getNumValues() == 2)  // Dead flag value?
9292          return DCI.CombineTo(N, Cond, SDValue());
9293        return Cond;
9294      }
9295
9296      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
9297      // for any integer data type, including i8/i16.
9298      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9299        SDValue Cond = N->getOperand(3);
9300        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9301                           DAG.getConstant(CC, MVT::i8), Cond);
9302
9303        // Zero extend the condition if needed.
9304        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9305                           FalseC->getValueType(0), Cond);
9306        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9307                           SDValue(FalseC, 0));
9308
9309        if (N->getNumValues() == 2)  // Dead flag value?
9310          return DCI.CombineTo(N, Cond, SDValue());
9311        return Cond;
9312      }
9313
9314      // Optimize cases that will turn into an LEA instruction.  This requires
9315      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
9316      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9317        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9318        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9319
9320        bool isFastMultiplier = false;
9321        if (Diff < 10) {
9322          switch ((unsigned char)Diff) {
9323          default: break;
9324          case 1:  // result = add base, cond
9325          case 2:  // result = lea base(    , cond*2)
9326          case 3:  // result = lea base(cond, cond*2)
9327          case 4:  // result = lea base(    , cond*4)
9328          case 5:  // result = lea base(cond, cond*4)
9329          case 8:  // result = lea base(    , cond*8)
9330          case 9:  // result = lea base(cond, cond*8)
9331            isFastMultiplier = true;
9332            break;
9333          }
9334        }
9335
9336        if (isFastMultiplier) {
9337          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9338          SDValue Cond = N->getOperand(3);
9339          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9340                             DAG.getConstant(CC, MVT::i8), Cond);
9341          // Zero extend the condition if needed.
9342          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9343                             Cond);
9344          // Scale the condition by the difference.
9345          if (Diff != 1)
9346            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9347                               DAG.getConstant(Diff, Cond.getValueType()));
9348
9349          // Add the base if non-zero.
9350          if (FalseC->getAPIntValue() != 0)
9351            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9352                               SDValue(FalseC, 0));
9353          if (N->getNumValues() == 2)  // Dead flag value?
9354            return DCI.CombineTo(N, Cond, SDValue());
9355          return Cond;
9356        }
9357      }
9358    }
9359  }
9360  return SDValue();
9361}
9362
9363
9364/// PerformMulCombine - Optimize a single multiply with constant into two
9365/// in order to implement it with two cheaper instructions, e.g.
9366/// LEA + SHL, LEA + LEA.
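/// For example, a multiply by 45 is split into 9 and 5 (two LEA-style
/// multiplies), and a multiply by 48 into 3 and 16 (an LEA-style multiply
/// plus a shift).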
9367static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
9368                                 TargetLowering::DAGCombinerInfo &DCI) {
9369  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9370    return SDValue();
9371
9372  EVT VT = N->getValueType(0);
9373  if (VT != MVT::i64)
9374    return SDValue();
9375
9376  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9377  if (!C)
9378    return SDValue();
9379  uint64_t MulAmt = C->getZExtValue();
9380  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
9381    return SDValue();
9382
9383  uint64_t MulAmt1 = 0;
9384  uint64_t MulAmt2 = 0;
9385  if ((MulAmt % 9) == 0) {
9386    MulAmt1 = 9;
9387    MulAmt2 = MulAmt / 9;
9388  } else if ((MulAmt % 5) == 0) {
9389    MulAmt1 = 5;
9390    MulAmt2 = MulAmt / 5;
9391  } else if ((MulAmt % 3) == 0) {
9392    MulAmt1 = 3;
9393    MulAmt2 = MulAmt / 3;
9394  }
9395  if (MulAmt2 &&
9396      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
9397    DebugLoc DL = N->getDebugLoc();
9398
9399    if (isPowerOf2_64(MulAmt2) &&
9400        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
9401      // If the second multiplier is pow2, issue it first. We want the multiply by
9402      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
9403      // is an add.
9404      std::swap(MulAmt1, MulAmt2);
9405
9406    SDValue NewMul;
9407    if (isPowerOf2_64(MulAmt1))
9408      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
9409                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
9410    else
9411      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
9412                           DAG.getConstant(MulAmt1, VT));
9413
9414    if (isPowerOf2_64(MulAmt2))
9415      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
9416                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
9417    else
9418      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
9419                           DAG.getConstant(MulAmt2, VT));
9420
9421    // Do not add new nodes to DAG combiner worklist.
9422    DCI.CombineTo(N, NewMul, false);
9423  }
9424  return SDValue();
9425}
9426
9427static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
9428  SDValue N0 = N->getOperand(0);
9429  SDValue N1 = N->getOperand(1);
9430  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
9431  EVT VT = N0.getValueType();
9432
9433  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
9434  // since the result of setcc_c is all zeros or all ones.
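  // Because the masked value is known to be all zeros or all ones, masking
  // with the shifted constant gives the same result as shifting the masked
  // value, and the explicit shift goes away.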
9435  if (N1C && N0.getOpcode() == ISD::AND &&
9436      N0.getOperand(1).getOpcode() == ISD::Constant) {
9437    SDValue N00 = N0.getOperand(0);
9438    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
9439        ((N00.getOpcode() == ISD::ANY_EXTEND ||
9440          N00.getOpcode() == ISD::ZERO_EXTEND) &&
9441         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
9442      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
9443      APInt ShAmt = N1C->getAPIntValue();
9444      Mask = Mask.shl(ShAmt);
9445      if (Mask != 0)
9446        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
9447                           N00, DAG.getConstant(Mask, VT));
9448    }
9449  }
9450
9451  return SDValue();
9452}
9453
9454/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
9455///                       when possible.
9456static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
9457                                   const X86Subtarget *Subtarget) {
9458  EVT VT = N->getValueType(0);
9459  if (!VT.isVector() && VT.isInteger() &&
9460      N->getOpcode() == ISD::SHL)
9461    return PerformSHLCombine(N, DAG);
9462
9463  // On X86 with SSE2 support, we can transform this to a vector shift if
9464  // all elements are shifted by the same amount.  We can't do this in legalize
9465  // because a constant vector is typically transformed to a constant pool
9466  // so we have no knowledge of the shift amount.
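  // For example, a splat shift such as (shl <4 x i32> %x, <5, 5, 5, 5>) can
  // be selected as a single PSLLD by the scalar amount 5.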
9467  if (!Subtarget->hasSSE2())
9468    return SDValue();
9469
9470  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
9471    return SDValue();
9472
9473  SDValue ShAmtOp = N->getOperand(1);
9474  EVT EltVT = VT.getVectorElementType();
9475  DebugLoc DL = N->getDebugLoc();
9476  SDValue BaseShAmt = SDValue();
9477  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
9478    unsigned NumElts = VT.getVectorNumElements();
9479    unsigned i = 0;
9480    for (; i != NumElts; ++i) {
9481      SDValue Arg = ShAmtOp.getOperand(i);
9482      if (Arg.getOpcode() == ISD::UNDEF) continue;
9483      BaseShAmt = Arg;
9484      break;
9485    }
9486    for (; i != NumElts; ++i) {
9487      SDValue Arg = ShAmtOp.getOperand(i);
9488      if (Arg.getOpcode() == ISD::UNDEF) continue;
9489      if (Arg != BaseShAmt) {
9490        return SDValue();
9491      }
9492    }
9493  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
9494             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
9495    SDValue InVec = ShAmtOp.getOperand(0);
9496    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
9497      unsigned NumElts = InVec.getValueType().getVectorNumElements();
9498      unsigned i = 0;
9499      for (; i != NumElts; ++i) {
9500        SDValue Arg = InVec.getOperand(i);
9501        if (Arg.getOpcode() == ISD::UNDEF) continue;
9502        BaseShAmt = Arg;
9503        break;
9504      }
9505    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
9506       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
9507         unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
9508         if (C->getZExtValue() == SplatIdx)
9509           BaseShAmt = InVec.getOperand(1);
9510       }
9511    }
9512    if (BaseShAmt.getNode() == 0)
9513      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
9514                              DAG.getIntPtrConstant(0));
9515  } else
9516    return SDValue();
9517
9518  // The shift amount is an i32.
9519  if (EltVT.bitsGT(MVT::i32))
9520    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
9521  else if (EltVT.bitsLT(MVT::i32))
9522    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
9523
9524  // The shift amount is identical so we can do a vector shift.
9525  SDValue  ValOp = N->getOperand(0);
9526  switch (N->getOpcode()) {
9527  default:
9528    llvm_unreachable("Unknown shift opcode!");
9529    break;
9530  case ISD::SHL:
9531    if (VT == MVT::v2i64)
9532      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9533                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
9534                         ValOp, BaseShAmt);
9535    if (VT == MVT::v4i32)
9536      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9537                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
9538                         ValOp, BaseShAmt);
9539    if (VT == MVT::v8i16)
9540      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9541                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
9542                         ValOp, BaseShAmt);
9543    break;
9544  case ISD::SRA:
9545    if (VT == MVT::v4i32)
9546      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9547                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
9548                         ValOp, BaseShAmt);
9549    if (VT == MVT::v8i16)
9550      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9551                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
9552                         ValOp, BaseShAmt);
9553    break;
9554  case ISD::SRL:
9555    if (VT == MVT::v2i64)
9556      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9557                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
9558                         ValOp, BaseShAmt);
9559    if (VT == MVT::v4i32)
9560      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9561                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
9562                         ValOp, BaseShAmt);
9563    if (VT ==  MVT::v8i16)
9564      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9565                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
9566                         ValOp, BaseShAmt);
9567    break;
9568  }
9569  return SDValue();
9570}
9571
9572static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
9573                                const X86Subtarget *Subtarget) {
9574  EVT VT = N->getValueType(0);
9575  if (VT != MVT::i64 || !Subtarget->is64Bit())
9576    return SDValue();
9577
9578  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
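  // The two shift amounts must add up to 64.  When the SUB appears on the
  // left-shift amount instead, the operands are swapped below and SHRD is
  // used rather than SHLD.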
9579  SDValue N0 = N->getOperand(0);
9580  SDValue N1 = N->getOperand(1);
9581  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
9582    std::swap(N0, N1);
9583  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
9584    return SDValue();
9585
9586  SDValue ShAmt0 = N0.getOperand(1);
9587  if (ShAmt0.getValueType() != MVT::i8)
9588    return SDValue();
9589  SDValue ShAmt1 = N1.getOperand(1);
9590  if (ShAmt1.getValueType() != MVT::i8)
9591    return SDValue();
9592  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
9593    ShAmt0 = ShAmt0.getOperand(0);
9594  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
9595    ShAmt1 = ShAmt1.getOperand(0);
9596
9597  DebugLoc DL = N->getDebugLoc();
9598  unsigned Opc = X86ISD::SHLD;
9599  SDValue Op0 = N0.getOperand(0);
9600  SDValue Op1 = N1.getOperand(0);
9601  if (ShAmt0.getOpcode() == ISD::SUB) {
9602    Opc = X86ISD::SHRD;
9603    std::swap(Op0, Op1);
9604    std::swap(ShAmt0, ShAmt1);
9605  }
9606
9607  if (ShAmt1.getOpcode() == ISD::SUB) {
9608    SDValue Sum = ShAmt1.getOperand(0);
9609    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
9610      if (SumC->getSExtValue() == 64 &&
9611          ShAmt1.getOperand(1) == ShAmt0)
9612        return DAG.getNode(Opc, DL, VT,
9613                           Op0, Op1,
9614                           DAG.getNode(ISD::TRUNCATE, DL,
9615                                       MVT::i8, ShAmt0));
9616    }
9617  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
9618    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
9619    if (ShAmt0C &&
9620        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == 64)
9621      return DAG.getNode(Opc, DL, VT,
9622                         N0.getOperand(0), N1.getOperand(0),
9623                         DAG.getNode(ISD::TRUNCATE, DL,
9624                                       MVT::i8, ShAmt0));
9625  }
9626
9627  return SDValue();
9628}
9629
9630/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
9631static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
9632                                   const X86Subtarget *Subtarget) {
9633  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
9634  // the FP state in cases where an emms may be missing.
9635  // A preferable solution to the general problem is to figure out the right
9636  // places to insert EMMS.  This qualifies as a quick hack.
9637
9638  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
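  // In 32-bit mode an i64 copy through memory would otherwise be legalized
  // into two 32-bit load/store pairs; a single f64 load/store keeps it to one
  // memory operation in each direction when SSE2 is available.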
9639  StoreSDNode *St = cast<StoreSDNode>(N);
9640  EVT VT = St->getValue().getValueType();
9641  if (VT.getSizeInBits() != 64)
9642    return SDValue();
9643
9644  const Function *F = DAG.getMachineFunction().getFunction();
9645  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
9646  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
9647    && Subtarget->hasSSE2();
9648  if ((VT.isVector() ||
9649       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
9650      isa<LoadSDNode>(St->getValue()) &&
9651      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
9652      St->getChain().hasOneUse() && !St->isVolatile()) {
9653    SDNode* LdVal = St->getValue().getNode();
9654    LoadSDNode *Ld = 0;
9655    int TokenFactorIndex = -1;
9656    SmallVector<SDValue, 8> Ops;
9657    SDNode* ChainVal = St->getChain().getNode();
9658    // Must be a store of a load.  We currently handle two cases:  the load
9659    // is a direct child, or it's under an intervening TokenFactor.  It is
9660    // possible to dig deeper under nested TokenFactors.
9661    if (ChainVal == LdVal)
9662      Ld = cast<LoadSDNode>(St->getChain());
9663    else if (St->getValue().hasOneUse() &&
9664             ChainVal->getOpcode() == ISD::TokenFactor) {
9665      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
9666        if (ChainVal->getOperand(i).getNode() == LdVal) {
9667          TokenFactorIndex = i;
9668          Ld = cast<LoadSDNode>(St->getValue());
9669        } else
9670          Ops.push_back(ChainVal->getOperand(i));
9671      }
9672    }
9673
9674    if (!Ld || !ISD::isNormalLoad(Ld))
9675      return SDValue();
9676
9677    // If this is not the MMX case, i.e. we are just turning i64 load/store
9678    // into f64 load/store, avoid the transformation if there are multiple
9679    // uses of the loaded value.
9680    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
9681      return SDValue();
9682
9683    DebugLoc LdDL = Ld->getDebugLoc();
9684    DebugLoc StDL = N->getDebugLoc();
9685    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
9686    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
9687    // pair instead.
9688    if (Subtarget->is64Bit() || F64IsLegal) {
9689      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
9690      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
9691                                  Ld->getBasePtr(), Ld->getSrcValue(),
9692                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
9693                                  Ld->isNonTemporal(), Ld->getAlignment());
9694      SDValue NewChain = NewLd.getValue(1);
9695      if (TokenFactorIndex != -1) {
9696        Ops.push_back(NewChain);
9697        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
9698                               Ops.size());
9699      }
9700      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
9701                          St->getSrcValue(), St->getSrcValueOffset(),
9702                          St->isVolatile(), St->isNonTemporal(),
9703                          St->getAlignment());
9704    }
9705
9706    // Otherwise, lower to two pairs of 32-bit loads / stores.
9707    SDValue LoAddr = Ld->getBasePtr();
9708    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
9709                                 DAG.getConstant(4, MVT::i32));
9710
9711    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
9712                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
9713                               Ld->isVolatile(), Ld->isNonTemporal(),
9714                               Ld->getAlignment());
9715    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
9716                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
9717                               Ld->isVolatile(), Ld->isNonTemporal(),
9718                               MinAlign(Ld->getAlignment(), 4));
9719
9720    SDValue NewChain = LoLd.getValue(1);
9721    if (TokenFactorIndex != -1) {
9722      Ops.push_back(LoLd);
9723      Ops.push_back(HiLd);
9724      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
9725                             Ops.size());
9726    }
9727
9728    LoAddr = St->getBasePtr();
9729    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
9730                         DAG.getConstant(4, MVT::i32));
9731
9732    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
9733                                St->getSrcValue(), St->getSrcValueOffset(),
9734                                St->isVolatile(), St->isNonTemporal(),
9735                                St->getAlignment());
9736    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
9737                                St->getSrcValue(),
9738                                St->getSrcValueOffset() + 4,
9739                                St->isVolatile(),
9740                                St->isNonTemporal(),
9741                                MinAlign(St->getAlignment(), 4));
9742    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
9743  }
9744  return SDValue();
9745}
9746
9747/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
9748/// X86ISD::FXOR nodes.
9749static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
9750  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
9751  // F[X]OR(0.0, x) -> x
9752  // F[X]OR(x, 0.0) -> x
9753  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
9754    if (C->getValueAPF().isPosZero())
9755      return N->getOperand(1);
9756  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
9757    if (C->getValueAPF().isPosZero())
9758      return N->getOperand(0);
9759  return SDValue();
9760}
9761
9762/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
9763static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
9764  // FAND(0.0, x) -> 0.0
9765  // FAND(x, 0.0) -> 0.0
9766  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
9767    if (C->getValueAPF().isPosZero())
9768      return N->getOperand(0);
9769  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
9770    if (C->getValueAPF().isPosZero())
9771      return N->getOperand(1);
9772  return SDValue();
9773}
9774
9775static SDValue PerformBTCombine(SDNode *N,
9776                                SelectionDAG &DAG,
9777                                TargetLowering::DAGCombinerInfo &DCI) {
9778  // BT ignores high bits in the bit index operand.
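  // Only the low Log2(BitWidth) bits of the index are demanded, so constants
  // and known bits in the index operand can be shrunk accordingly.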
9779  SDValue Op1 = N->getOperand(1);
9780  if (Op1.hasOneUse()) {
9781    unsigned BitWidth = Op1.getValueSizeInBits();
9782    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
9783    APInt KnownZero, KnownOne;
9784    TargetLowering::TargetLoweringOpt TLO(DAG);
9785    TargetLowering &TLI = DAG.getTargetLoweringInfo();
9786    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
9787        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
9788      DCI.CommitTargetLoweringOpt(TLO);
9789  }
9790  return SDValue();
9791}
9792
9793static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
9794  SDValue Op = N->getOperand(0);
9795  if (Op.getOpcode() == ISD::BIT_CONVERT)
9796    Op = Op.getOperand(0);
9797  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
9798  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
9799      VT.getVectorElementType().getSizeInBits() ==
9800      OpVT.getVectorElementType().getSizeInBits()) {
9801    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
9802  }
9803  return SDValue();
9804}
9805
9806// On X86 and X86-64, atomic operations are lowered to locked instructions.
9807// Locked instructions, in turn, have implicit fence semantics (all memory
9808 // operations are flushed before issuing the locked instruction, and they
9809// are not buffered), so we can fold away the common pattern of
9810// fence-atomic-fence.
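// Concretely: when the chain operand of a qualifying atomic node is a
// MEMBARRIER, re-wire the atomic to the fence's incoming chain, dropping the
// now-redundant fence.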
9811static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
9812  SDValue atomic = N->getOperand(0);
9813  switch (atomic.getOpcode()) {
9814    case ISD::ATOMIC_CMP_SWAP:
9815    case ISD::ATOMIC_SWAP:
9816    case ISD::ATOMIC_LOAD_ADD:
9817    case ISD::ATOMIC_LOAD_SUB:
9818    case ISD::ATOMIC_LOAD_AND:
9819    case ISD::ATOMIC_LOAD_OR:
9820    case ISD::ATOMIC_LOAD_XOR:
9821    case ISD::ATOMIC_LOAD_NAND:
9822    case ISD::ATOMIC_LOAD_MIN:
9823    case ISD::ATOMIC_LOAD_MAX:
9824    case ISD::ATOMIC_LOAD_UMIN:
9825    case ISD::ATOMIC_LOAD_UMAX:
9826      break;
9827    default:
9828      return SDValue();
9829  }
9830
9831  SDValue fence = atomic.getOperand(0);
9832  if (fence.getOpcode() != ISD::MEMBARRIER)
9833    return SDValue();
9834
9835  switch (atomic.getOpcode()) {
9836    case ISD::ATOMIC_CMP_SWAP:
9837      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9838                                    atomic.getOperand(1), atomic.getOperand(2),
9839                                    atomic.getOperand(3));
9840    case ISD::ATOMIC_SWAP:
9841    case ISD::ATOMIC_LOAD_ADD:
9842    case ISD::ATOMIC_LOAD_SUB:
9843    case ISD::ATOMIC_LOAD_AND:
9844    case ISD::ATOMIC_LOAD_OR:
9845    case ISD::ATOMIC_LOAD_XOR:
9846    case ISD::ATOMIC_LOAD_NAND:
9847    case ISD::ATOMIC_LOAD_MIN:
9848    case ISD::ATOMIC_LOAD_MAX:
9849    case ISD::ATOMIC_LOAD_UMIN:
9850    case ISD::ATOMIC_LOAD_UMAX:
9851      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9852                                    atomic.getOperand(1), atomic.getOperand(2));
9853    default:
9854      return SDValue();
9855  }
9856}
9857
9858static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
9859  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
9860  //           (and (i32 x86isd::setcc_carry), 1)
9861  // This eliminates the zext. This transformation is necessary because
9862  // ISD::SETCC is always legalized to i8.
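  // SETCC_CARRY materializes an all-zeros/all-ones value and can be emitted
  // directly at the wider type, so the (and ..., 1) is rebuilt at the zext's
  // result type and the zero extension itself disappears.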
9863  DebugLoc dl = N->getDebugLoc();
9864  SDValue N0 = N->getOperand(0);
9865  EVT VT = N->getValueType(0);
9866  if (N0.getOpcode() == ISD::AND &&
9867      N0.hasOneUse() &&
9868      N0.getOperand(0).hasOneUse()) {
9869    SDValue N00 = N0.getOperand(0);
9870    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
9871      return SDValue();
9872    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9873    if (!C || C->getZExtValue() != 1)
9874      return SDValue();
9875    return DAG.getNode(ISD::AND, dl, VT,
9876                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
9877                                   N00.getOperand(0), N00.getOperand(1)),
9878                       DAG.getConstant(1, VT));
9879  }
9880
9881  return SDValue();
9882}
9883
9884SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
9885                                             DAGCombinerInfo &DCI) const {
9886  SelectionDAG &DAG = DCI.DAG;
9887  switch (N->getOpcode()) {
9888  default: break;
9889  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
9890  case ISD::EXTRACT_VECTOR_ELT:
9891                        return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
9892  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
9893  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
9894  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
9895  case ISD::SHL:
9896  case ISD::SRA:
9897  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
9898  case ISD::OR:             return PerformOrCombine(N, DAG, Subtarget);
9899  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
9900  case X86ISD::FXOR:
9901  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
9902  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
9903  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
9904  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
9905  case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
9906  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
9907  }
9908
9909  return SDValue();
9910}
9911
9912 /// PerformDAGCombinePromotion - This method queries the target whether it is
9913 /// beneficial for the dag combiner to promote the specified node. If true, it
9914/// should return the desired promotion type by reference.
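/// For X86 this is used to promote i16 operations to i32; the promotion is
/// declined whenever it would prevent folding a load into the instruction
/// (see the operand checks below).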
9915bool X86TargetLowering::PerformDAGCombinePromotion(SDValue Op, EVT &PVT) const {
9916  if (!Promote16Bit)
9917    return false;
9918
9919  EVT VT = Op.getValueType();
9920  if (VT != MVT::i16)
9921    return false;
9922
9923  bool Commute = true;
9924  switch (Op.getOpcode()) {
9925  default: return false;
9926  case ISD::SUB:
9927    Commute = false;
9928    // fallthrough
9929  case ISD::ADD:
9930  case ISD::MUL:
9931  case ISD::AND:
9932  case ISD::OR:
9933  case ISD::XOR: {
9934    SDValue N0 = Op.getOperand(0);
9935    SDValue N1 = Op.getOperand(1);
9936    if (!Commute && isa<LoadSDNode>(N1))
9937      return false;
9938    // Avoid disabling potential load folding opportunities.
9939    if ((isa<LoadSDNode>(N0) && N0.hasOneUse()) && !isa<ConstantSDNode>(N1))
9940      return false;
9941    if ((isa<LoadSDNode>(N1) && N1.hasOneUse()) && !isa<ConstantSDNode>(N0))
9942      return false;
9943  }
9944  }
9945
9946  PVT = MVT::i32;
9947  return true;
9948}
9949
9950//===----------------------------------------------------------------------===//
9951//                           X86 Inline Assembly Support
9952//===----------------------------------------------------------------------===//
9953
9954static bool LowerToBSwap(CallInst *CI) {
9955  // FIXME: this should verify that we are targeting a 486 or better.  If not,
9956  // we will turn this bswap into something that will be lowered to logical ops
9957  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
9958  // so don't worry about this.
9959
9960  // Verify this is a simple bswap.
9961  if (CI->getNumOperands() != 2 ||
9962      CI->getType() != CI->getOperand(0)->getType() ||
9963      !CI->getType()->isIntegerTy())
9964    return false;
9965
9966  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
9967  if (!Ty || Ty->getBitWidth() % 16 != 0)
9968    return false;
9969
9970  // Okay, we can do this xform, do so now.
9971  const Type *Tys[] = { Ty };
9972  Module *M = CI->getParent()->getParent()->getParent();
9973  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
9974
9975  Value *Op = CI->getOperand(0);
9976  Op = CallInst::Create(Int, Op, CI->getName(), CI);
9977
9978  CI->replaceAllUsesWith(Op);
9979  CI->eraseFromParent();
9980  return true;
9981}
9982
9983bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
9984  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
9985  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
9986
9987  std::string AsmStr = IA->getAsmString();
9988
9989  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
9990  SmallVector<StringRef, 4> AsmPieces;
9991  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?
9992
9993  switch (AsmPieces.size()) {
9994  default: return false;
9995  case 1:
9996    AsmStr = AsmPieces[0];
9997    AsmPieces.clear();
9998    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
9999
10000    // bswap $0
10001    if (AsmPieces.size() == 2 &&
10002        (AsmPieces[0] == "bswap" ||
10003         AsmPieces[0] == "bswapq" ||
10004         AsmPieces[0] == "bswapl") &&
10005        (AsmPieces[1] == "$0" ||
10006         AsmPieces[1] == "${0:q}")) {
10007      // No need to check constraints, nothing other than the equivalent of
10008      // "=r,0" would be valid here.
10009      return LowerToBSwap(CI);
10010    }
10011    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
10012    if (CI->getType()->isIntegerTy(16) &&
10013        AsmPieces.size() == 3 &&
10014        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
10015        AsmPieces[1] == "$$8," &&
10016        AsmPieces[2] == "${0:w}" &&
10017        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
10018      AsmPieces.clear();
10019      const std::string &Constraints = IA->getConstraintString();
10020      SplitString(StringRef(Constraints).substr(5), AsmPieces, ",");
10021      std::sort(AsmPieces.begin(), AsmPieces.end());
10022      if (AsmPieces.size() == 4 &&
10023          AsmPieces[0] == "~{cc}" &&
10024          AsmPieces[1] == "~{dirflag}" &&
10025          AsmPieces[2] == "~{flags}" &&
10026          AsmPieces[3] == "~{fpsr}") {
10027        return LowerToBSwap(CI);
10028      }
10029    }
10030    break;
10031  case 3:
10032    if (CI->getType()->isIntegerTy(64) &&
10033        Constraints.size() >= 2 &&
10034        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
10035        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
10036      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
10037      SmallVector<StringRef, 4> Words;
10038      SplitString(AsmPieces[0], Words, " \t");
10039      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
10040        Words.clear();
10041        SplitString(AsmPieces[1], Words, " \t");
10042        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
10043          Words.clear();
10044          SplitString(AsmPieces[2], Words, " \t,");
10045          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
10046              Words[2] == "%edx") {
10047            return LowerToBSwap(CI);
10048          }
10049        }
10050      }
10051    }
10052    break;
10053  }
10054  return false;
10055}
10056
10057
10058
10059/// getConstraintType - Given a constraint letter, return the type of
10060/// constraint it is for this target.
10061X86TargetLowering::ConstraintType
10062X86TargetLowering::getConstraintType(const std::string &Constraint) const {
10063  if (Constraint.size() == 1) {
10064    switch (Constraint[0]) {
10065    case 'A':
10066      return C_Register;
10067    case 'f':
10068    case 'r':
10069    case 'R':
10070    case 'l':
10071    case 'q':
10072    case 'Q':
10073    case 'x':
10074    case 'y':
10075    case 'Y':
10076      return C_RegisterClass;
10077    case 'e':
10078    case 'Z':
10079      return C_Other;
10080    default:
10081      break;
10082    }
10083  }
10084  return TargetLowering::getConstraintType(Constraint);
10085}
10086
10087/// LowerXConstraint - try to replace an X constraint, which matches anything,
10088/// with another that has more specific requirements based on the type of the
10089/// corresponding operand.
10090const char *X86TargetLowering::
10091LowerXConstraint(EVT ConstraintVT) const {
10092  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
10093  // 'f' like normal targets.
10094  if (ConstraintVT.isFloatingPoint()) {
10095    if (Subtarget->hasSSE2())
10096      return "Y";
10097    if (Subtarget->hasSSE1())
10098      return "x";
10099  }
10100
10101  return TargetLowering::LowerXConstraint(ConstraintVT);
10102}
10103
10104/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
10105/// vector.  If it is invalid, don't add anything to Ops.
10106void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
10107                                                     char Constraint,
10108                                                     bool hasMemory,
10109                                                     std::vector<SDValue>&Ops,
10110                                                     SelectionDAG &DAG) const {
10111  SDValue Result(0, 0);
10112
10113  switch (Constraint) {
10114  default: break;
10115  case 'I':
10116    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10117      if (C->getZExtValue() <= 31) {
10118        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10119        break;
10120      }
10121    }
10122    return;
10123  case 'J':
10124    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10125      if (C->getZExtValue() <= 63) {
10126        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10127        break;
10128      }
10129    }
10130    return;
10131  case 'K':
10132    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10133      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
10134        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10135        break;
10136      }
10137    }
10138    return;
10139  case 'N':
10140    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10141      if (C->getZExtValue() <= 255) {
10142        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10143        break;
10144      }
10145    }
10146    return;
10147  case 'e': {
10148    // 32-bit signed value
10149    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10150      const ConstantInt *CI = C->getConstantIntValue();
10151      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
10152                                  C->getSExtValue())) {
10153        // Widen to 64 bits here to get it sign extended.
10154        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
10155        break;
10156      }
10157    // FIXME gcc accepts some relocatable values here too, but only in certain
10158    // memory models; it's complicated.
10159    }
10160    return;
10161  }
10162  case 'Z': {
10163    // 32-bit unsigned value
10164    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10165      const ConstantInt *CI = C->getConstantIntValue();
10166      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
10167                                  C->getZExtValue())) {
10168        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10169        break;
10170      }
10171    }
10172    // FIXME gcc accepts some relocatable values here too, but only in certain
10173    // memory models; it's complicated.
10174    return;
10175  }
10176  case 'i': {
10177    // Literal immediates are always ok.
10178    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
10179      // Widen to 64 bits here to get it sign extended.
10180      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
10181      break;
10182    }
10183
10184    // If we are in non-pic codegen mode, we allow the address of a global (with
10185    // an optional displacement) to be used with 'i'.
10186    GlobalAddressSDNode *GA = 0;
10187    int64_t Offset = 0;
10188
10189    // Match either (GA), (GA+C), (GA+C1+C2), etc.
10190    while (1) {
10191      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
10192        Offset += GA->getOffset();
10193        break;
10194      } else if (Op.getOpcode() == ISD::ADD) {
10195        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
10196          Offset += C->getZExtValue();
10197          Op = Op.getOperand(0);
10198          continue;
10199        }
10200      } else if (Op.getOpcode() == ISD::SUB) {
10201        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
10202          Offset += -C->getZExtValue();
10203          Op = Op.getOperand(0);
10204          continue;
10205        }
10206      }
10207
10208      // Otherwise, this isn't something we can handle, reject it.
10209      return;
10210    }
10211
10212    const GlobalValue *GV = GA->getGlobal();
10213    // If we require an extra load to get this address, as in PIC mode, we
10214    // can't accept it.
10215    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
10216                                                        getTargetMachine())))
10217      return;
10218
10219    if (hasMemory)
10220      Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
10221    else
10222      Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
10223    Result = Op;
10224    break;
10225  }
10226  }
10227
10228  if (Result.getNode()) {
10229    Ops.push_back(Result);
10230    return;
10231  }
10232  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
10233                                                      Ops, DAG);
10234}
10235
10236std::vector<unsigned> X86TargetLowering::
10237getRegClassForInlineAsmConstraint(const std::string &Constraint,
10238                                  EVT VT) const {
10239  if (Constraint.size() == 1) {
10240    // FIXME: not handling fp-stack yet!
10241    switch (Constraint[0]) {      // GCC X86 Constraint Letters
10242    default: break;  // Unknown constraint letter
10243    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
10244      if (Subtarget->is64Bit()) {
10245        if (VT == MVT::i32)
10246          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
10247                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
10248                                       X86::R10D,X86::R11D,X86::R12D,
10249                                       X86::R13D,X86::R14D,X86::R15D,
10250                                       X86::EBP, X86::ESP, 0);
10251        else if (VT == MVT::i16)
10252          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
10253                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
10254                                       X86::R10W,X86::R11W,X86::R12W,
10255                                       X86::R13W,X86::R14W,X86::R15W,
10256                                       X86::BP,  X86::SP, 0);
10257        else if (VT == MVT::i8)
10258          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
10259                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
10260                                       X86::R10B,X86::R11B,X86::R12B,
10261                                       X86::R13B,X86::R14B,X86::R15B,
10262                                       X86::BPL, X86::SPL, 0);
10263
10264        else if (VT == MVT::i64)
10265          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
10266                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
10267                                       X86::R10, X86::R11, X86::R12,
10268                                       X86::R13, X86::R14, X86::R15,
10269                                       X86::RBP, X86::RSP, 0);
10270
10271        break;
10272      }
10273      // 32-bit fallthrough
10274    case 'Q':   // Q_REGS
10275      if (VT == MVT::i32)
10276        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
10277      else if (VT == MVT::i16)
10278        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
10279      else if (VT == MVT::i8)
10280        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
10281      else if (VT == MVT::i64)
10282        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
10283      break;
10284    }
10285  }
10286
10287  return std::vector<unsigned>();
10288}
10289
10290std::pair<unsigned, const TargetRegisterClass*>
10291X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
10292                                                EVT VT) const {
10293  // First, see if this is a constraint that directly corresponds to an LLVM
10294  // register class.
10295  if (Constraint.size() == 1) {
10296    // GCC Constraint Letters
10297    switch (Constraint[0]) {
10298    default: break;
10299    case 'r':   // GENERAL_REGS
10300    case 'l':   // INDEX_REGS
10301      if (VT == MVT::i8)
10302        return std::make_pair(0U, X86::GR8RegisterClass);
10303      if (VT == MVT::i16)
10304        return std::make_pair(0U, X86::GR16RegisterClass);
10305      if (VT == MVT::i32 || !Subtarget->is64Bit())
10306        return std::make_pair(0U, X86::GR32RegisterClass);
10307      return std::make_pair(0U, X86::GR64RegisterClass);
10308    case 'R':   // LEGACY_REGS
10309      if (VT == MVT::i8)
10310        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
10311      if (VT == MVT::i16)
10312        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
10313      if (VT == MVT::i32 || !Subtarget->is64Bit())
10314        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
10315      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
10316    case 'f':  // FP Stack registers.
10317      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
10318      // value to the correct fpstack register class.
10319      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
10320        return std::make_pair(0U, X86::RFP32RegisterClass);
10321      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
10322        return std::make_pair(0U, X86::RFP64RegisterClass);
10323      return std::make_pair(0U, X86::RFP80RegisterClass);
10324    case 'y':   // MMX_REGS if MMX allowed.
10325      if (!Subtarget->hasMMX()) break;
10326      return std::make_pair(0U, X86::VR64RegisterClass);
10327    case 'Y':   // SSE_REGS if SSE2 allowed
10328      if (!Subtarget->hasSSE2()) break;
10329      // FALL THROUGH.
10330    case 'x':   // SSE_REGS if SSE1 allowed
10331      if (!Subtarget->hasSSE1()) break;
10332
10333      switch (VT.getSimpleVT().SimpleTy) {
10334      default: break;
10335      // Scalar SSE types.
10336      case MVT::f32:
10337      case MVT::i32:
10338        return std::make_pair(0U, X86::FR32RegisterClass);
10339      case MVT::f64:
10340      case MVT::i64:
10341        return std::make_pair(0U, X86::FR64RegisterClass);
10342      // Vector types.
10343      case MVT::v16i8:
10344      case MVT::v8i16:
10345      case MVT::v4i32:
10346      case MVT::v2i64:
10347      case MVT::v4f32:
10348      case MVT::v2f64:
10349        return std::make_pair(0U, X86::VR128RegisterClass);
10350      }
10351      break;
10352    }
10353  }
10354
10355  // Use the default implementation in TargetLowering to convert the register
10356  // constraint into a member of a register class.
10357  std::pair<unsigned, const TargetRegisterClass*> Res;
10358  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
10359
10360  // Not found as a standard register?
10361  if (Res.second == 0) {
10362    // Map st(0) .. st(7) to the corresponding ST0 .. ST7 registers.
10363    if (Constraint.size() == 7 && Constraint[0] == '{' &&
10364        tolower(Constraint[1]) == 's' &&
10365        tolower(Constraint[2]) == 't' &&
10366        Constraint[3] == '(' &&
10367        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
10368        Constraint[5] == ')' &&
10369        Constraint[6] == '}') {
10370
10371      Res.first = X86::ST0+Constraint[4]-'0';
10372      Res.second = X86::RFP80RegisterClass;
10373      return Res;
10374    }
10375
10376    // GCC allows "st(0)" to be called just plain "st".
10377    if (StringRef("{st}").equals_lower(Constraint)) {
10378      Res.first = X86::ST0;
10379      Res.second = X86::RFP80RegisterClass;
10380      return Res;
10381    }
10382
10383    // flags -> EFLAGS
10384    if (StringRef("{flags}").equals_lower(Constraint)) {
10385      Res.first = X86::EFLAGS;
10386      Res.second = X86::CCRRegisterClass;
10387      return Res;
10388    }
10389
10390    // 'A' means EAX + EDX.
10391    if (Constraint == "A") {
10392      Res.first = X86::EAX;
10393      Res.second = X86::GR32_ADRegisterClass;
10394      return Res;
10395    }
10396    return Res;
10397  }
10398
10399  // Otherwise, check to see if this is a register class of the wrong value
10400  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
10401  // turn into {ax},{dx}.
10402  if (Res.second->hasType(VT))
10403    return Res;   // Correct type already, nothing to do.
10404
10405  // All of the single-register GCC register classes map their values onto
10406  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
10407  // really want an 8-bit or 32-bit register, map to the appropriate register
10408  // class and return the appropriate register.
10409  if (Res.second == X86::GR16RegisterClass) {
10410    if (VT == MVT::i8) {
10411      unsigned DestReg = 0;
10412      switch (Res.first) {
10413      default: break;
10414      case X86::AX: DestReg = X86::AL; break;
10415      case X86::DX: DestReg = X86::DL; break;
10416      case X86::CX: DestReg = X86::CL; break;
10417      case X86::BX: DestReg = X86::BL; break;
10418      }
10419      if (DestReg) {
10420        Res.first = DestReg;
10421        Res.second = X86::GR8RegisterClass;
10422      }
10423    } else if (VT == MVT::i32) {
10424      unsigned DestReg = 0;
10425      switch (Res.first) {
10426      default: break;
10427      case X86::AX: DestReg = X86::EAX; break;
10428      case X86::DX: DestReg = X86::EDX; break;
10429      case X86::CX: DestReg = X86::ECX; break;
10430      case X86::BX: DestReg = X86::EBX; break;
10431      case X86::SI: DestReg = X86::ESI; break;
10432      case X86::DI: DestReg = X86::EDI; break;
10433      case X86::BP: DestReg = X86::EBP; break;
10434      case X86::SP: DestReg = X86::ESP; break;
10435      }
10436      if (DestReg) {
10437        Res.first = DestReg;
10438        Res.second = X86::GR32RegisterClass;
10439      }
10440    } else if (VT == MVT::i64) {
10441      unsigned DestReg = 0;
10442      switch (Res.first) {
10443      default: break;
10444      case X86::AX: DestReg = X86::RAX; break;
10445      case X86::DX: DestReg = X86::RDX; break;
10446      case X86::CX: DestReg = X86::RCX; break;
10447      case X86::BX: DestReg = X86::RBX; break;
10448      case X86::SI: DestReg = X86::RSI; break;
10449      case X86::DI: DestReg = X86::RDI; break;
10450      case X86::BP: DestReg = X86::RBP; break;
10451      case X86::SP: DestReg = X86::RSP; break;
10452      }
10453      if (DestReg) {
10454        Res.first = DestReg;
10455        Res.second = X86::GR64RegisterClass;
10456      }
10457    }
10458  } else if (Res.second == X86::FR32RegisterClass ||
10459             Res.second == X86::FR64RegisterClass ||
10460             Res.second == X86::VR128RegisterClass) {
10461    // Handle references to XMM physical registers that got mapped into the
10462    // wrong class.  This can happen with constraints like {xmm0} where the
10463    // target independent register mapper will just pick the first match it can
10464    // find, ignoring the required type.
10465    if (VT == MVT::f32)
10466      Res.second = X86::FR32RegisterClass;
10467    else if (VT == MVT::f64)
10468      Res.second = X86::FR64RegisterClass;
10469    else if (X86::VR128RegisterClass->hasType(VT))
10470      Res.second = X86::VR128RegisterClass;
10471  }
10472
10473  return Res;
10474}
10475