X86ISelLowering.cpp revision 90eb4024ba1ff2b945b0c157910dd41cd4e74575
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "x86-isel"
16#include "X86.h"
17#include "X86InstrBuilder.h"
18#include "X86ISelLowering.h"
19#include "X86TargetMachine.h"
20#include "X86TargetObjectFile.h"
21#include "llvm/CallingConv.h"
22#include "llvm/Constants.h"
23#include "llvm/DerivedTypes.h"
24#include "llvm/GlobalAlias.h"
25#include "llvm/GlobalVariable.h"
26#include "llvm/Function.h"
27#include "llvm/Instructions.h"
28#include "llvm/Intrinsics.h"
29#include "llvm/LLVMContext.h"
30#include "llvm/CodeGen/MachineFrameInfo.h"
31#include "llvm/CodeGen/MachineFunction.h"
32#include "llvm/CodeGen/MachineInstrBuilder.h"
33#include "llvm/CodeGen/MachineJumpTableInfo.h"
34#include "llvm/CodeGen/MachineModuleInfo.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/PseudoSourceValue.h"
37#include "llvm/MC/MCAsmInfo.h"
38#include "llvm/MC/MCContext.h"
39#include "llvm/MC/MCExpr.h"
40#include "llvm/MC/MCSymbol.h"
41#include "llvm/ADT/BitVector.h"
42#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
44#include "llvm/ADT/StringExtras.h"
45#include "llvm/ADT/VectorExtras.h"
46#include "llvm/Support/CommandLine.h"
47#include "llvm/Support/Debug.h"
48#include "llvm/Support/Dwarf.h"
49#include "llvm/Support/ErrorHandling.h"
50#include "llvm/Support/MathExtras.h"
51#include "llvm/Support/raw_ostream.h"
52using namespace llvm;
53using namespace dwarf;
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57static cl::opt<bool>
58DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
59
60// Forward declarations.
61static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
62                       SDValue V2);
63
64static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
65
66  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
67
68  if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
69    if (is64Bit) return new X8664_MachoTargetObjectFile();
70    return new TargetLoweringObjectFileMachO();
71  } else if (TM.getSubtarget<X86Subtarget>().isTargetELF()) {
72    if (is64Bit) return new X8664_ELFTargetObjectFile(TM);
73    return new X8632_ELFTargetObjectFile(TM);
74  } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) {
75    return new TargetLoweringObjectFileCOFF();
76  }
77  llvm_unreachable("unknown subtarget type");
78}
79
80X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
81  : TargetLowering(TM, createTLOF(TM)) {
82  Subtarget = &TM.getSubtarget<X86Subtarget>();
83  X86ScalarSSEf64 = Subtarget->hasSSE2();
84  X86ScalarSSEf32 = Subtarget->hasSSE1();
85  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
86
87  RegInfo = TM.getRegisterInfo();
88  TD = getTargetData();
89
90  // Set up the TargetLowering object.
91
92  // X86 is weird: it always uses i8 for shift amounts and setcc results.
93  setShiftAmountType(MVT::i8);
94  setBooleanContents(ZeroOrOneBooleanContent);
95  setSchedulingPreference(Sched::RegPressure);
96  setStackPointerRegisterToSaveRestore(X86StackPtr);
97
98  if (Subtarget->isTargetDarwin()) {
99    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
100    setUseUnderscoreSetJmp(false);
101    setUseUnderscoreLongJmp(false);
102  } else if (Subtarget->isTargetMingw()) {
103    // The MS runtime is weird: it exports _setjmp, but plain longjmp.
104    setUseUnderscoreSetJmp(true);
105    setUseUnderscoreLongJmp(false);
106  } else {
107    setUseUnderscoreSetJmp(true);
108    setUseUnderscoreLongJmp(true);
109  }
110
111  // Set up the register classes.
112  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
113  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
114  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
115  if (Subtarget->is64Bit())
116    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
117
118  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
119
120  // We don't accept any truncstore of integer registers.
121  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
122  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
123  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
124  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
125  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
126  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
127
128  // SETOEQ and SETUNE require checking two conditions.
129  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
130  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
131  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
132  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
133  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
134  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
135
136  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
137  // operation.
138  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
139  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
140  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
141
142  if (Subtarget->is64Bit()) {
143    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
144    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
145  } else if (!UseSoftFloat) {
146    // We have an algorithm for SSE2->double, and we turn this into a
147    // 64-bit FILD followed by conditional FADD for other targets.
148    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
149    // We have an algorithm for SSE2, and we turn this into a 64-bit
150    // FILD for other targets.
151    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
152  }
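  // A rough sketch of the non-SSE i64 path above, assuming the usual x87
  // trick: the value is converted with a signed FILD, and if the original
  // bit pattern had the sign bit set, 2^64 is added back with an FADD to
  // recover the unsigned result.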
153
154  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
155  // this operation.
156  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
157  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
158
159  if (!UseSoftFloat) {
160    // SSE has no i16 to fp conversion, only i32
161    if (X86ScalarSSEf32) {
162      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
163      // f32 and f64 cases are Legal, f80 case is not
164      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
165    } else {
166      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
167      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
168    }
169  } else {
170    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
171    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
172  }
173
174  // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
175  // are Legal; f80 is custom lowered.
176  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
177  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
178
179  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
180  // this operation.
181  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
182  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
183
184  if (X86ScalarSSEf32) {
185    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
186    // f32 and f64 cases are Legal, f80 case is not
187    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
188  } else {
189    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
190    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
191  }
192
193  // Handle FP_TO_UINT by promoting the destination to a larger signed
194  // conversion.
195  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
196  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
197  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
198
199  if (Subtarget->is64Bit()) {
200    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
201    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
202  } else if (!UseSoftFloat) {
203    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
204      // Expand FP_TO_UINT into a select.
205      // FIXME: We would like to use a Custom expander here eventually to do
206      // the optimal thing for SSE vs. the default expansion in the legalizer.
207      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
208    else
209      // With SSE3 we can use fisttpll to convert to a signed i64; without
210      // SSE, we're stuck with a fistpll.
211      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
212  }
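  // The Expand path for i32 FP_TO_UINT above roughly becomes a select:
  //   x < 2^31 ? (i32)fptosi(x) : 0x80000000 ^ (i32)fptosi(x - 2^31)
  // hence the FIXME about doing better with a Custom expander under SSE.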
213
214  // TODO: when we have SSE, these could be more efficient by using movd/movq.
215  if (!X86ScalarSSEf64) {
216    setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
217    setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
218    if (Subtarget->is64Bit()) {
219      setOperationAction(ISD::BIT_CONVERT    , MVT::f64  , Expand);
220      // Without SSE, i64->f64 goes through memory; i64->MMX is Legal.
221      if (Subtarget->hasMMX() && !DisableMMX)
222        setOperationAction(ISD::BIT_CONVERT    , MVT::i64  , Custom);
223      else
224        setOperationAction(ISD::BIT_CONVERT    , MVT::i64  , Expand);
225    }
226  }
227
228  // Scalar integer divide and remainder are lowered to use operations that
229  // produce two results, to match the available instructions. This exposes
230  // the two-result form to trivial CSE, which is able to combine x/y and x%y
231  // into a single instruction.
232  //
233  // Scalar integer multiply-high is also lowered to use two-result
234  // operations, to match the available instructions. However, plain multiply
235  // (low) operations are left as Legal, as there are single-result
236  // instructions for this in x86. Using the two-result multiply instructions
237  // when both high and low results are needed must be arranged by dagcombine.
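  // For example, IR that computes both x/y and x%y on i32 ends up as a single
  // ISD::SDIVREM node after CSE, which selects to one IDIV (quotient in EAX,
  // remainder in EDX).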
238  setOperationAction(ISD::MULHS           , MVT::i8    , Expand);
239  setOperationAction(ISD::MULHU           , MVT::i8    , Expand);
240  setOperationAction(ISD::SDIV            , MVT::i8    , Expand);
241  setOperationAction(ISD::UDIV            , MVT::i8    , Expand);
242  setOperationAction(ISD::SREM            , MVT::i8    , Expand);
243  setOperationAction(ISD::UREM            , MVT::i8    , Expand);
244  setOperationAction(ISD::MULHS           , MVT::i16   , Expand);
245  setOperationAction(ISD::MULHU           , MVT::i16   , Expand);
246  setOperationAction(ISD::SDIV            , MVT::i16   , Expand);
247  setOperationAction(ISD::UDIV            , MVT::i16   , Expand);
248  setOperationAction(ISD::SREM            , MVT::i16   , Expand);
249  setOperationAction(ISD::UREM            , MVT::i16   , Expand);
250  setOperationAction(ISD::MULHS           , MVT::i32   , Expand);
251  setOperationAction(ISD::MULHU           , MVT::i32   , Expand);
252  setOperationAction(ISD::SDIV            , MVT::i32   , Expand);
253  setOperationAction(ISD::UDIV            , MVT::i32   , Expand);
254  setOperationAction(ISD::SREM            , MVT::i32   , Expand);
255  setOperationAction(ISD::UREM            , MVT::i32   , Expand);
256  setOperationAction(ISD::MULHS           , MVT::i64   , Expand);
257  setOperationAction(ISD::MULHU           , MVT::i64   , Expand);
258  setOperationAction(ISD::SDIV            , MVT::i64   , Expand);
259  setOperationAction(ISD::UDIV            , MVT::i64   , Expand);
260  setOperationAction(ISD::SREM            , MVT::i64   , Expand);
261  setOperationAction(ISD::UREM            , MVT::i64   , Expand);
262
263  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
264  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
265  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
266  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
267  if (Subtarget->is64Bit())
268    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
269  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
270  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
271  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
272  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
273  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
274  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
275  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
276  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
277
278  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
279  setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
280  setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
281  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
282  setOperationAction(ISD::CTTZ             , MVT::i16  , Custom);
283  setOperationAction(ISD::CTLZ             , MVT::i16  , Custom);
284  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
285  setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
286  setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
287  if (Subtarget->is64Bit()) {
288    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
289    setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
290    setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
291  }
292
293  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
294  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
295
296  // These should be promoted to a larger select which is supported.
297  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
298  // X86 wants to expand cmov itself.
299  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
300  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
301  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
302  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
303  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
304  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
305  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
306  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
307  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
308  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
309  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
310  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
311  if (Subtarget->is64Bit()) {
312    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
313    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
314  }
315  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
316
317  // Darwin ABI issue.
318  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
319  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
320  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
321  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
322  if (Subtarget->is64Bit())
323    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
324  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
325  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
326  if (Subtarget->is64Bit()) {
327    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
328    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
329    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
330    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
331    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
332  }
333  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
334  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
335  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
336  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
337  if (Subtarget->is64Bit()) {
338    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
339    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
340    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
341  }
342
343  if (Subtarget->hasSSE1())
344    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
345
346  if (!Subtarget->hasSSE2())
347    setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);
348  // On X86 and X86-64, atomic operations are lowered to locked instructions.
349  // Locked instructions, in turn, have implicit fence semantics (all memory
350  // operations are flushed before issuing the locked instruction, and they
351  // are not buffered), so we can fold away the common pattern of
352  // fence-atomic-fence.
353  setShouldFoldAtomicFences(true);
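  // For example, an IR sequence of fence; atomicrmw add; fence can be lowered
  // to a single LOCK-prefixed RMW instruction with no surrounding MFENCEs.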
354
355  // Expand certain atomics
356  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
357  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
358  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
359  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
360
361  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
362  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
363  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
364  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
365
366  if (!Subtarget->is64Bit()) {
367    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
368    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
369    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
370    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
371    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
372    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
373    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
374  }
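  // On 32-bit targets the 64-bit atomics above have no single-instruction
  // form, so they are custom lowered using LOCK CMPXCHG8B loops.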
375
376  // FIXME - use subtarget debug flags
377  if (!Subtarget->isTargetDarwin() &&
378      !Subtarget->isTargetELF() &&
379      !Subtarget->isTargetCygMing()) {
380    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
381  }
382
383  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
384  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
385  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
386  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
387  if (Subtarget->is64Bit()) {
388    setExceptionPointerRegister(X86::RAX);
389    setExceptionSelectorRegister(X86::RDX);
390  } else {
391    setExceptionPointerRegister(X86::EAX);
392    setExceptionSelectorRegister(X86::EDX);
393  }
394  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
395  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
396
397  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
398
399  setOperationAction(ISD::TRAP, MVT::Other, Legal);
400
401  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
402  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
403  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
404  if (Subtarget->is64Bit()) {
405    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
406    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
407  } else {
408    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
409    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
410  }
411
412  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
413  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
414  if (Subtarget->is64Bit())
415    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
416  if (Subtarget->isTargetCygMing())
417    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
418  else
419    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
420
421  if (!UseSoftFloat && X86ScalarSSEf64) {
422    // f32 and f64 use SSE.
423    // Set up the FP register classes.
424    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
425    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
426
427    // Use ANDPD to simulate FABS.
428    setOperationAction(ISD::FABS , MVT::f64, Custom);
429    setOperationAction(ISD::FABS , MVT::f32, Custom);
430
431    // Use XORP to simulate FNEG.
432    setOperationAction(ISD::FNEG , MVT::f64, Custom);
433    setOperationAction(ISD::FNEG , MVT::f32, Custom);
434
435    // Use ANDPD and ORPD to simulate FCOPYSIGN.
436    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
437    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
438
439    // We don't support sin/cos/fmod
440    setOperationAction(ISD::FSIN , MVT::f64, Expand);
441    setOperationAction(ISD::FCOS , MVT::f64, Expand);
442    setOperationAction(ISD::FSIN , MVT::f32, Expand);
443    setOperationAction(ISD::FCOS , MVT::f32, Expand);
444
445    // Expand FP immediates into loads from the stack, except for the special
446    // cases we handle.
447    addLegalFPImmediate(APFloat(+0.0)); // xorpd
448    addLegalFPImmediate(APFloat(+0.0f)); // xorps
449  } else if (!UseSoftFloat && X86ScalarSSEf32) {
450    // Use SSE for f32, x87 for f64.
451    // Set up the FP register classes.
452    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
453    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
454
455    // Use ANDPS to simulate FABS.
456    setOperationAction(ISD::FABS , MVT::f32, Custom);
457
458    // Use XORP to simulate FNEG.
459    setOperationAction(ISD::FNEG , MVT::f32, Custom);
460
461    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
462
463    // Use ANDPS and ORPS to simulate FCOPYSIGN.
464    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
465    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
466
467    // We don't support sin/cos/fmod
468    setOperationAction(ISD::FSIN , MVT::f32, Expand);
469    setOperationAction(ISD::FCOS , MVT::f32, Expand);
470
471    // Special cases we handle for FP constants.
472    addLegalFPImmediate(APFloat(+0.0f)); // xorps
473    addLegalFPImmediate(APFloat(+0.0)); // FLD0
474    addLegalFPImmediate(APFloat(+1.0)); // FLD1
475    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
476    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
477
478    if (!UnsafeFPMath) {
479      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
480      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
481    }
482  } else if (!UseSoftFloat) {
483    // f32 and f64 in x87.
484    // Set up the FP register classes.
485    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
486    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
487
488    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
489    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
490    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
491    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
492
493    if (!UnsafeFPMath) {
494      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
495      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
496    }
497    addLegalFPImmediate(APFloat(+0.0)); // FLD0
498    addLegalFPImmediate(APFloat(+1.0)); // FLD1
499    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
500    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
501    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
502    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
503    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
504    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
505  }
506
507  // Long double always uses X87.
508  if (!UseSoftFloat) {
509    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
510    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
511    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
512    {
513      bool ignored;
514      APFloat TmpFlt(+0.0);
515      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
516                     &ignored);
517      addLegalFPImmediate(TmpFlt);  // FLD0
518      TmpFlt.changeSign();
519      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
520      APFloat TmpFlt2(+1.0);
521      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
522                      &ignored);
523      addLegalFPImmediate(TmpFlt2);  // FLD1
524      TmpFlt2.changeSign();
525      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
526    }
527
528    if (!UnsafeFPMath) {
529      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
530      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
531    }
532  }
533
534  // Always use a library call for pow.
535  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
536  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
537  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
538
539  setOperationAction(ISD::FLOG, MVT::f80, Expand);
540  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
541  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
542  setOperationAction(ISD::FEXP, MVT::f80, Expand);
543  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
544
545  // First set operation action for all vector types to either promote
546  // (for widening) or expand (for scalarization). Then we will selectively
547  // turn on ones that can be effectively codegen'd.
548  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
549       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
550    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
551    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
552    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
553    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
554    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
555    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
556    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
557    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
558    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
559    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
560    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
561    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
562    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
563    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
564    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
565    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
566    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
567    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
568    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
569    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
570    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
571    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
572    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
573    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
574    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
575    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
576    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
577    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
578    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
579    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
580    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
581    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
582    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
583    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
584    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
585    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
586    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
587    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
588    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
589    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
590    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
591    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
592    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
593    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
594    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
595    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
596    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
597    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
598    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
599    setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
600    setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
601    setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
602    setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
603    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
604         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
605      setTruncStoreAction((MVT::SimpleValueType)VT,
606                          (MVT::SimpleValueType)InnerVT, Expand);
607    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
608    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
609    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
610  }
611
612  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
613  // with -msoft-float, disable use of MMX as well.
614  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
615    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass, false);
616    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
617    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);
618
619    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);
620
621    setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
622    setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
623    setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
624    setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
625
626    setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
627    setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
628    setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
629    setOperationAction(ISD::SUB,                MVT::v1i64, Legal);
630
631    setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
632    setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
633
634    setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
635    AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
636    setOperationAction(ISD::AND,                MVT::v4i16, Promote);
637    AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
638    setOperationAction(ISD::AND,                MVT::v2i32, Promote);
639    AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
640    setOperationAction(ISD::AND,                MVT::v1i64, Legal);
641
642    setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
643    AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
644    setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
645    AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
646    setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
647    AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
648    setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
649
650    setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
651    AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
652    setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
653    AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
654    setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
655    AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
656    setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
657
658    setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
659    AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
660    setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
661    AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
662    setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
663    AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
664    setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
665
666    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
667    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
668    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
669    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
670
671    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
672    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
673    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
674    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
675
676    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
677    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
678    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
679
680    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
681
682    setOperationAction(ISD::SELECT,             MVT::v8i8, Promote);
683    setOperationAction(ISD::SELECT,             MVT::v4i16, Promote);
684    setOperationAction(ISD::SELECT,             MVT::v2i32, Promote);
685    setOperationAction(ISD::SELECT,             MVT::v1i64, Custom);
686    setOperationAction(ISD::VSETCC,             MVT::v8i8, Custom);
687    setOperationAction(ISD::VSETCC,             MVT::v4i16, Custom);
688    setOperationAction(ISD::VSETCC,             MVT::v2i32, Custom);
689
690    if (!X86ScalarSSEf64 && Subtarget->is64Bit()) {
691      setOperationAction(ISD::BIT_CONVERT,        MVT::v8i8,  Custom);
692      setOperationAction(ISD::BIT_CONVERT,        MVT::v4i16, Custom);
693      setOperationAction(ISD::BIT_CONVERT,        MVT::v2i32, Custom);
694      setOperationAction(ISD::BIT_CONVERT,        MVT::v1i64, Custom);
695    }
696  }
697
698  if (!UseSoftFloat && Subtarget->hasSSE1()) {
699    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
700
701    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
702    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
703    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
704    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
705    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
706    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
707    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
708    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
709    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
710    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
711    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
712    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
713  }
714
715  if (!UseSoftFloat && Subtarget->hasSSE2()) {
716    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
717
718    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
719    // registers cannot be used even for integer operations.
720    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
721    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
722    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
723    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
724
725    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
726    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
727    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
728    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
729    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
730    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
731    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
732    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
733    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
734    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
735    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
736    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
737    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
738    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
739    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
740    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
741
742    setOperationAction(ISD::VSETCC,             MVT::v2f64, Custom);
743    setOperationAction(ISD::VSETCC,             MVT::v16i8, Custom);
744    setOperationAction(ISD::VSETCC,             MVT::v8i16, Custom);
745    setOperationAction(ISD::VSETCC,             MVT::v4i32, Custom);
746
747    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
748    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
749    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
750    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
751    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
752
753    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
754    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
755    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
756    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
757    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
758
759    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
760    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
761      EVT VT = (MVT::SimpleValueType)i;
762      // Do not attempt to custom lower non-power-of-2 vectors
763      if (!isPowerOf2_32(VT.getVectorNumElements()))
764        continue;
765      // Do not attempt to custom lower non-128-bit vectors
766      if (!VT.is128BitVector())
767        continue;
768      setOperationAction(ISD::BUILD_VECTOR,
769                         VT.getSimpleVT().SimpleTy, Custom);
770      setOperationAction(ISD::VECTOR_SHUFFLE,
771                         VT.getSimpleVT().SimpleTy, Custom);
772      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
773                         VT.getSimpleVT().SimpleTy, Custom);
774    }
775
776    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
777    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
778    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
779    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
780    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
781    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
782
783    if (Subtarget->is64Bit()) {
784      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
785      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
786    }
787
788    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
789    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
790      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
791      EVT VT = SVT;
792
793      // Do not attempt to promote non-128-bit vectors
794      if (!VT.is128BitVector())
795        continue;
796
797      setOperationAction(ISD::AND,    SVT, Promote);
798      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
799      setOperationAction(ISD::OR,     SVT, Promote);
800      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
801      setOperationAction(ISD::XOR,    SVT, Promote);
802      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
803      setOperationAction(ISD::LOAD,   SVT, Promote);
804      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
805      setOperationAction(ISD::SELECT, SVT, Promote);
806      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
807    }
808
809    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
810
811    // Custom lower v2i64 and v2f64 selects.
812    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
813    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
814    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
815    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
816
817    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
818    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
819    if (!DisableMMX && Subtarget->hasMMX()) {
820      setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
821      setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
822    }
823  }
824
825  if (Subtarget->hasSSE41()) {
826    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
827    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
828    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
829    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
830    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
831    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
832    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
833    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
834    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
835    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
836
837    // FIXME: Do we need to handle scalar-to-vector here?
838    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
839
840    // i8 and i16 vectors are custom, because the source register and source
841    // memory operand types are not the same width.  f32 vectors are
842    // custom since the immediate controlling the insert encodes additional
843    // information.
844    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
845    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
846    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
847    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
848
849    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
850    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
851    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
852    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
853
854    if (Subtarget->is64Bit()) {
855      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
856      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
857    }
858  }
859
860  if (Subtarget->hasSSE42()) {
861    setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
862  }
863
864  if (!UseSoftFloat && Subtarget->hasAVX()) {
865    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
866    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
867    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
868    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
869
870    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
871    setOperationAction(ISD::LOAD,               MVT::v8i32, Legal);
872    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
873    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
874    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
875    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
876    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
877    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
878    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
879    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
880    //setOperationAction(ISD::BUILD_VECTOR,       MVT::v8f32, Custom);
881    //setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8f32, Custom);
882    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
883    //setOperationAction(ISD::SELECT,             MVT::v8f32, Custom);
884    //setOperationAction(ISD::VSETCC,             MVT::v8f32, Custom);
885
886    // Operations on v16i16 and v32i8 are left commented out below for now.
887    //setOperationAction(ISD::ADD,                MVT::v16i16, Legal);
888    setOperationAction(ISD::ADD,                MVT::v8i32, Custom);
889    setOperationAction(ISD::ADD,                MVT::v4i64, Custom);
890    //setOperationAction(ISD::SUB,                MVT::v32i8, Legal);
891    //setOperationAction(ISD::SUB,                MVT::v16i16, Legal);
892    setOperationAction(ISD::SUB,                MVT::v8i32, Custom);
893    setOperationAction(ISD::SUB,                MVT::v4i64, Custom);
894    //setOperationAction(ISD::MUL,                MVT::v16i16, Legal);
895    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
896    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
897    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
898    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
899    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
900    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
901
902    setOperationAction(ISD::VSETCC,             MVT::v4f64, Custom);
903    // setOperationAction(ISD::VSETCC,             MVT::v32i8, Custom);
904    // setOperationAction(ISD::VSETCC,             MVT::v16i16, Custom);
905    setOperationAction(ISD::VSETCC,             MVT::v8i32, Custom);
906
907    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i8, Custom);
908    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i16, Custom);
909    // setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i16, Custom);
910    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i32, Custom);
911    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8f32, Custom);
912
913    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f64, Custom);
914    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i64, Custom);
915    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f64, Custom);
916    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i64, Custom);
917    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f64, Custom);
918    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);
919
920#if 0
921    // Not sure we want to do this since there are no 256-bit integer
922    // operations in AVX
923
924    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
925    // This includes 256-bit vectors
926    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
927      EVT VT = (MVT::SimpleValueType)i;
928
929      // Do not attempt to custom lower non-power-of-2 vectors
930      if (!isPowerOf2_32(VT.getVectorNumElements()))
931        continue;
932
933      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
934      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
935      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
936    }
937
938    if (Subtarget->is64Bit()) {
939      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i64, Custom);
940      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
941    }
942#endif
943
944#if 0
945    // Not sure we want to do this since there are no 256-bit integer
946    // operations in AVX
947
948    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
949    // Including 256-bit vectors
950    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
951      EVT VT = (MVT::SimpleValueType)i;
952
953      if (!VT.is256BitVector()) {
954        continue;
955      }
956      setOperationAction(ISD::AND,    VT, Promote);
957      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
958      setOperationAction(ISD::OR,     VT, Promote);
959      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
960      setOperationAction(ISD::XOR,    VT, Promote);
961      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
962      setOperationAction(ISD::LOAD,   VT, Promote);
963      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
964      setOperationAction(ISD::SELECT, VT, Promote);
965      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
966    }
967
968    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
969#endif
970  }
971
972  // We want to custom lower some of our intrinsics.
973  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
974
975  // Add/Sub/Mul with overflow operations are custom lowered.
976  setOperationAction(ISD::SADDO, MVT::i32, Custom);
977  setOperationAction(ISD::UADDO, MVT::i32, Custom);
978  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
979  setOperationAction(ISD::USUBO, MVT::i32, Custom);
980  setOperationAction(ISD::SMULO, MVT::i32, Custom);
981
982  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
983  // handle type legalization for these operations here.
984  //
985  // FIXME: We really should do custom legalization for addition and
986  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
987  // than generic legalization for 64-bit multiplication-with-overflow, though.
988  if (Subtarget->is64Bit()) {
989    setOperationAction(ISD::SADDO, MVT::i64, Custom);
990    setOperationAction(ISD::UADDO, MVT::i64, Custom);
991    setOperationAction(ISD::SSUBO, MVT::i64, Custom);
992    setOperationAction(ISD::USUBO, MVT::i64, Custom);
993    setOperationAction(ISD::SMULO, MVT::i64, Custom);
994  }
995
996  if (!Subtarget->is64Bit()) {
997    // These libcalls are not available in 32-bit.
998    setLibcallName(RTLIB::SHL_I128, 0);
999    setLibcallName(RTLIB::SRL_I128, 0);
1000    setLibcallName(RTLIB::SRA_I128, 0);
1001  }
1002
1003  // We have target-specific dag combine patterns for the following nodes:
1004  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1005  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1006  setTargetDAGCombine(ISD::BUILD_VECTOR);
1007  setTargetDAGCombine(ISD::SELECT);
1008  setTargetDAGCombine(ISD::SHL);
1009  setTargetDAGCombine(ISD::SRA);
1010  setTargetDAGCombine(ISD::SRL);
1011  setTargetDAGCombine(ISD::OR);
1012  setTargetDAGCombine(ISD::STORE);
1013  setTargetDAGCombine(ISD::ZERO_EXTEND);
1014  if (Subtarget->is64Bit())
1015    setTargetDAGCombine(ISD::MUL);
1016
1017  computeRegisterProperties();
1018
1019  // FIXME: These should be based on subtarget info. Plus, the values should
1020  // be smaller when we are optimizing for size.
1021  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1022  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1023  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
1024  setPrefLoopAlignment(16);
1025  benefitFromCodePlacementOpt = true;
1026}
1027
1028
1029MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
1030  return MVT::i8;
1031}
1032
1033
1034/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1035/// the desired ByVal argument alignment.
1036static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
1037  if (MaxAlign == 16)
1038    return;
1039  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1040    if (VTy->getBitWidth() == 128)
1041      MaxAlign = 16;
1042  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1043    unsigned EltAlign = 0;
1044    getMaxByValAlign(ATy->getElementType(), EltAlign);
1045    if (EltAlign > MaxAlign)
1046      MaxAlign = EltAlign;
1047  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
1048    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1049      unsigned EltAlign = 0;
1050      getMaxByValAlign(STy->getElementType(i), EltAlign);
1051      if (EltAlign > MaxAlign)
1052        MaxAlign = EltAlign;
1053      if (MaxAlign == 16)
1054        break;
1055    }
1056  }
1057  return;
1058}
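// For example, a struct containing a <4 x float> member bumps MaxAlign to 16
// here, so such aggregates get 16-byte byval alignment; plain integer
// aggregates keep the 4-byte default.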
1059
1060/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1061/// function arguments in the caller parameter area. For X86, aggregates
1062/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1063/// are at 4-byte boundaries.
1064unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
1065  if (Subtarget->is64Bit()) {
1066    // Max of 8 and alignment of type.
1067    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1068    if (TyAlign > 8)
1069      return TyAlign;
1070    return 8;
1071  }
1072
1073  unsigned Align = 4;
1074  if (Subtarget->hasSSE1())
1075    getMaxByValAlign(Ty, Align);
1076  return Align;
1077}
1078
1079/// getOptimalMemOpType - Returns the target specific optimal type for load
1080/// and store operations as a result of memset, memcpy, and memmove
1081/// lowering. If DstAlign is zero, the destination alignment can satisfy any
1082/// constraint. Similarly, if SrcAlign is zero, there is no need to check it
1083/// against an alignment requirement, probably because the source does not
1084/// need to be loaded. If
1085/// 'NonScalarIntSafe' is true, that means it's safe to return a
1086/// non-scalar-integer type, e.g. empty string source, constant, or loaded
1087/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
1088/// constant so it does not need to be loaded.
1089/// It returns EVT::Other if the type should be determined using generic
1090/// target-independent logic.
1091EVT
1092X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1093                                       unsigned DstAlign, unsigned SrcAlign,
1094                                       bool NonScalarIntSafe,
1095                                       bool MemcpyStrSrc,
1096                                       MachineFunction &MF) const {
1097  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
1098  // linux.  This is because the stack realignment code can't handle certain
1099  // cases like PR2962.  This should be removed when PR2962 is fixed.
1100  const Function *F = MF.getFunction();
1101  if (NonScalarIntSafe &&
1102      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
1103    if (Size >= 16 &&
1104        (Subtarget->isUnalignedMemAccessFast() ||
1105         ((DstAlign == 0 || DstAlign >= 16) &&
1106          (SrcAlign == 0 || SrcAlign >= 16))) &&
1107        Subtarget->getStackAlignment() >= 16) {
1108      if (Subtarget->hasSSE2())
1109        return MVT::v4i32;
1110      if (Subtarget->hasSSE1())
1111        return MVT::v4f32;
1112    } else if (!MemcpyStrSrc && Size >= 8 &&
1113               !Subtarget->is64Bit() &&
1114               Subtarget->getStackAlignment() >= 8 &&
1115               Subtarget->hasSSE2()) {
1116      // Do not use f64 to lower memcpy if source is string constant. It's
1117      // better to use i32 to avoid the loads.
1118      return MVT::f64;
1119    }
1120  }
1121  if (Subtarget->is64Bit() && Size >= 8)
1122    return MVT::i64;
1123  return MVT::i32;
1124}
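// A rough illustration: a 64-byte memcpy whose operands are 16-byte aligned
// returns MVT::v4i32 on an SSE2 target, so the copy is emitted as four
// 16-byte vector load/store pairs rather than scalar moves.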
1125
1126/// getJumpTableEncoding - Return the entry encoding for a jump table in the
1127/// current function.  The returned value is a member of the
1128/// MachineJumpTableInfo::JTEntryKind enum.
1129unsigned X86TargetLowering::getJumpTableEncoding() const {
1130  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1131  // symbol.
1132  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1133      Subtarget->isPICStyleGOT())
1134    return MachineJumpTableInfo::EK_Custom32;
1135
1136  // Otherwise, use the normal jump table encoding heuristics.
1137  return TargetLowering::getJumpTableEncoding();
1138}
1139
1140/// getPICBaseSymbol - Return the X86-32 PIC base.
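/// The symbol is <PrivateGlobalPrefix><FunctionNumber>"$pb", e.g. "L1$pb" for
/// the second function in a module when the private prefix is "L", as on
/// Darwin.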
1141MCSymbol *
1142X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
1143                                    MCContext &Ctx) const {
1144  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
1145  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
1146                               Twine(MF->getFunctionNumber())+"$pb");
1147}
1148
1149
1150const MCExpr *
1151X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1152                                             const MachineBasicBlock *MBB,
1153                                             unsigned uid, MCContext &Ctx) const {
1154  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1155         Subtarget->isPICStyleGOT());
1156  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1157  // references.
1158  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1159                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1160}
1161
1162/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
1163/// jumptable.
1164SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1165                                                    SelectionDAG &DAG) const {
1166  if (!Subtarget->is64Bit())
1167    // This doesn't have DebugLoc associated with it, but is not really the
1168    // same as a Register.
1169    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
1170  return Table;
1171}
1172
1173/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1174/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1175/// MCExpr.
1176const MCExpr *X86TargetLowering::
1177getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1178                             MCContext &Ctx) const {
1179  // X86-64 uses RIP relative addressing based on the jump table label.
1180  if (Subtarget->isPICStyleRIPRel())
1181    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1182
1183  // Otherwise, the reference is relative to the PIC base.
1184  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
1185}
1186
1187/// getFunctionAlignment - Return the Log2 alignment of this function.
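/// That is, 2^4 = 16 bytes normally, and only 2^0 = 1 byte when the function
/// is marked with the OptimizeForSize attribute.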
1188unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
1189  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
1190}
1191
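/// getStackCookieLocation - On Linux the stack protector cookie lives in
/// thread-local storage: %fs:0x28 on x86-64 (%gs:0x28 under the kernel code
/// model) and %gs:0x14 on i386. The X86 backend models %gs as address space
/// 256 and %fs as address space 257, which is what the returned
/// (AddressSpace, Offset) pair encodes.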
1192bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1193                                               unsigned &Offset) const {
1194  if (!Subtarget->isTargetLinux())
1195    return false;
1196
1197  if (Subtarget->is64Bit()) {
1198    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1199    Offset = 0x28;
1200    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1201      AddressSpace = 256;
1202    else
1203      AddressSpace = 257;
1204  } else {
1205    // %gs:0x14 on i386
1206    Offset = 0x14;
1207    AddressSpace = 256;
1208  }
1209  return true;
1210}
1211
1212
1213//===----------------------------------------------------------------------===//
1214//               Return Value Calling Convention Implementation
1215//===----------------------------------------------------------------------===//
1216
1217#include "X86GenCallingConv.inc"
1218
1219bool
1220X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
1221                        const SmallVectorImpl<ISD::OutputArg> &Outs,
1222                        LLVMContext &Context) const {
1223  SmallVector<CCValAssign, 16> RVLocs;
1224  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1225                 RVLocs, Context);
1226  return CCInfo.CheckReturn(Outs, RetCC_X86);
1227}
1228
1229SDValue
1230X86TargetLowering::LowerReturn(SDValue Chain,
1231                               CallingConv::ID CallConv, bool isVarArg,
1232                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1233                               const SmallVectorImpl<SDValue> &OutVals,
1234                               DebugLoc dl, SelectionDAG &DAG) const {
1235  MachineFunction &MF = DAG.getMachineFunction();
1236  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1237
1238  SmallVector<CCValAssign, 16> RVLocs;
1239  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1240                 RVLocs, *DAG.getContext());
1241  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1242
1243  // Add the regs to the liveout set for the function.
1244  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1245  for (unsigned i = 0; i != RVLocs.size(); ++i)
1246    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
1247      MRI.addLiveOut(RVLocs[i].getLocReg());
1248
1249  SDValue Flag;
1250
1251  SmallVector<SDValue, 6> RetOps;
1252  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1253  // Operand #1 = Bytes To Pop
1254  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1255                   MVT::i16));
1256
1257  // Copy the result values into the output registers.
1258  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1259    CCValAssign &VA = RVLocs[i];
1260    assert(VA.isRegLoc() && "Can only return in registers!");
1261    SDValue ValToCopy = OutVals[i];
1262
1263    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1264    // the RET instruction and handled by the FP Stackifier.
1265    if (VA.getLocReg() == X86::ST0 ||
1266        VA.getLocReg() == X86::ST1) {
1267      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1268      // change the value to the FP stack register class.
1269      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1270        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1271      RetOps.push_back(ValToCopy);
1272      // Don't emit a copytoreg.
1273      continue;
1274    }
1275
1276    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1277    // which is returned in RAX / RDX.
1278    if (Subtarget->is64Bit()) {
1279      EVT ValVT = ValToCopy.getValueType();
1280      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
1281        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
1282        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
1283          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
1284                                  ValToCopy);
1285      }
1286    }
1287
1288    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1289    Flag = Chain.getValue(1);
1290  }
1291
1292  // The x86-64 ABI for returning structs by value requires that we copy
1293  // the sret argument into %rax for the return. We saved the argument into
1294  // a virtual register in the entry block, so now we copy the value out
1295  // and into %rax.
1296  if (Subtarget->is64Bit() &&
1297      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1298    MachineFunction &MF = DAG.getMachineFunction();
1299    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1300    unsigned Reg = FuncInfo->getSRetReturnReg();
1301    assert(Reg &&
1302           "SRetReturnReg should have been set in LowerFormalArguments().");
1303    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1304
1305    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1306    Flag = Chain.getValue(1);
1307
1308    // RAX now acts like a return value.
1309    MRI.addLiveOut(X86::RAX);
1310  }
1311
1312  RetOps[0] = Chain;  // Update chain.
1313
1314  // Add the flag if we have it.
1315  if (Flag.getNode())
1316    RetOps.push_back(Flag);
1317
1318  return DAG.getNode(X86ISD::RET_FLAG, dl,
1319                     MVT::Other, &RetOps[0], RetOps.size());
1320}
1321
1322/// LowerCallResult - Lower the result values of a call into the
1323/// appropriate copies out of appropriate physical registers.
1324///
1325SDValue
1326X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1327                                   CallingConv::ID CallConv, bool isVarArg,
1328                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1329                                   DebugLoc dl, SelectionDAG &DAG,
1330                                   SmallVectorImpl<SDValue> &InVals) const {
1331
1332  // Assign locations to each value returned by this call.
1333  SmallVector<CCValAssign, 16> RVLocs;
1334  bool Is64Bit = Subtarget->is64Bit();
1335  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1336                 RVLocs, *DAG.getContext());
1337  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1338
1339  // Copy all of the result registers out of their specified physreg.
1340  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1341    CCValAssign &VA = RVLocs[i];
1342    EVT CopyVT = VA.getValVT();
1343
1344    // If this is x86-64, and we disabled SSE, we can't return FP values
1345    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1346        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1347      report_fatal_error("SSE register return with SSE disabled");
1348    }
1349
1350    SDValue Val;
1351
1352    // If this is a call to a function that returns an fp value on the floating
1353    // point stack, we must guarantee that the value is popped from the stack, so
1354    // a CopyFromReg is not good enough - the copy instruction may be eliminated
1355    // if the return value is not used. We use the FpGET_ST0 instructions
1356    // instead.
1357    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1358      // If we prefer to use the value in xmm registers, copy it out as f80 and
1359      // use a truncate to move it from fp stack reg to xmm reg.
1360      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1361      bool isST0 = VA.getLocReg() == X86::ST0;
1362      unsigned Opc = 0;
1363      if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32;
1364      if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64;
1365      if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80;
1366      SDValue Ops[] = { Chain, InFlag };
1367      Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag,
1368                                         Ops, 2), 1);
1369      Val = Chain.getValue(0);
1370
1371      // Round the f80 to the right size, which also moves it to the appropriate
1372      // xmm register.
1373      if (CopyVT != VA.getValVT())
1374        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1375                          // This truncation won't change the value.
1376                          DAG.getIntPtrConstant(1));
1377    } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
1378      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
1379      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1380        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1381                                   MVT::v2i64, InFlag).getValue(1);
1382        Val = Chain.getValue(0);
1383        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1384                          Val, DAG.getConstant(0, MVT::i64));
1385      } else {
1386        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1387                                   MVT::i64, InFlag).getValue(1);
1388        Val = Chain.getValue(0);
1389      }
1390      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
1391    } else {
1392      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1393                                 CopyVT, InFlag).getValue(1);
1394      Val = Chain.getValue(0);
1395    }
1396    InFlag = Chain.getValue(2);
1397    InVals.push_back(Val);
1398  }
1399
1400  return Chain;
1401}
1402
1403
1404//===----------------------------------------------------------------------===//
1405//                C & StdCall & Fast Calling Convention implementation
1406//===----------------------------------------------------------------------===//
1407//  The StdCall calling convention is the standard for many Windows API
1408//  routines. It differs from the C calling convention just a little: the
1409//  callee cleans up the stack, not the caller. Symbols should also be
1410//  decorated in some fancy way :) It doesn't support any vector arguments.
1411//  For info on fast calling convention see Fast Calling Convention (tail call)
1412//  implementation LowerX86_32FastCCCallTo.
1413
1414/// CallIsStructReturn - Determines whether a call uses struct return
1415/// semantics.
1416static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1417  if (Outs.empty())
1418    return false;
1419
1420  return Outs[0].Flags.isSRet();
1421}
1422
1423/// ArgsAreStructReturn - Determines whether a function uses struct
1424/// return semantics.
1425static bool
1426ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1427  if (Ins.empty())
1428    return false;
1429
1430  return Ins[0].Flags.isSRet();
1431}
1432
1433/// CCAssignFnForNode - Selects the correct CCAssignFn for the
1434/// given CallingConvention value.
1435CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1436  if (Subtarget->is64Bit()) {
1437    if (CC == CallingConv::GHC)
1438      return CC_X86_64_GHC;
1439    else if (Subtarget->isTargetWin64())
1440      return CC_X86_Win64_C;
1441    else
1442      return CC_X86_64_C;
1443  }
1444
1445  if (CC == CallingConv::X86_FastCall)
1446    return CC_X86_32_FastCall;
1447  else if (CC == CallingConv::X86_ThisCall)
1448    return CC_X86_32_ThisCall;
1449  else if (CC == CallingConv::Fast)
1450    return CC_X86_32_FastCC;
1451  else if (CC == CallingConv::GHC)
1452    return CC_X86_32_GHC;
1453  else
1454    return CC_X86_32_C;
1455}
1456
1457/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1458/// by "Src" to address "Dst" with size and alignment information specified by
1459/// the specific parameter attribute. The copy will be passed as a byval
1460/// function parameter.
1461static SDValue
1462CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1463                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1464                          DebugLoc dl) {
1465  SDValue SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1466  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1467                       /*isVolatile*/false, /*AlwaysInline=*/true,
1468                       NULL, 0, NULL, 0);
1469}
1470
1471/// IsTailCallConvention - Return true if the calling convention is one that
1472/// supports tail call optimization.
1473static bool IsTailCallConvention(CallingConv::ID CC) {
1474  return (CC == CallingConv::Fast || CC == CallingConv::GHC);
1475}
1476
1477/// FuncIsMadeTailCallSafe - Return true if the function is being made into
1478/// a tailcall target by changing its ABI.
1479static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
1480  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1481}
1482
1483SDValue
1484X86TargetLowering::LowerMemArgument(SDValue Chain,
1485                                    CallingConv::ID CallConv,
1486                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1487                                    DebugLoc dl, SelectionDAG &DAG,
1488                                    const CCValAssign &VA,
1489                                    MachineFrameInfo *MFI,
1490                                    unsigned i) const {
1491  // Create the nodes corresponding to a load from this parameter slot.
1492  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1493  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
1494  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1495  EVT ValVT;
1496
1497  // If value is passed by pointer we have address passed instead of the value
1498  // itself.
1499  if (VA.getLocInfo() == CCValAssign::Indirect)
1500    ValVT = VA.getLocVT();
1501  else
1502    ValVT = VA.getValVT();
1503
1504  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1505  // changed with more analysis.
1506  // In case of tail call optimization, mark all arguments mutable, since they
1507  // could be overwritten by the lowering of arguments in case of a tail call.
1508  if (Flags.isByVal()) {
1509    int FI = MFI->CreateFixedObject(Flags.getByValSize(),
1510                                    VA.getLocMemOffset(), isImmutable);
1511    return DAG.getFrameIndex(FI, getPointerTy());
1512  } else {
1513    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1514                                    VA.getLocMemOffset(), isImmutable);
1515    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1516    return DAG.getLoad(ValVT, dl, Chain, FIN,
1517                       PseudoSourceValue::getFixedStack(FI), 0,
1518                       false, false, 0);
1519  }
1520}
1521
1522SDValue
1523X86TargetLowering::LowerFormalArguments(SDValue Chain,
1524                                        CallingConv::ID CallConv,
1525                                        bool isVarArg,
1526                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1527                                        DebugLoc dl,
1528                                        SelectionDAG &DAG,
1529                                        SmallVectorImpl<SDValue> &InVals)
1530                                          const {
1531  MachineFunction &MF = DAG.getMachineFunction();
1532  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1533
1534  const Function* Fn = MF.getFunction();
1535  if (Fn->hasExternalLinkage() &&
1536      Subtarget->isTargetCygMing() &&
1537      Fn->getName() == "main")
1538    FuncInfo->setForceFramePointer(true);
1539
1540  MachineFrameInfo *MFI = MF.getFrameInfo();
1541  bool Is64Bit = Subtarget->is64Bit();
1542  bool IsWin64 = Subtarget->isTargetWin64();
1543
1544  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1545         "Var args not supported with calling convention fastcc or ghc");
1546
1547  // Assign locations to all of the incoming arguments.
1548  SmallVector<CCValAssign, 16> ArgLocs;
1549  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1550                 ArgLocs, *DAG.getContext());
1551  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1552
1553  unsigned LastVal = ~0U;
1554  SDValue ArgValue;
1555  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1556    CCValAssign &VA = ArgLocs[i];
1557    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1558    // places.
1559    assert(VA.getValNo() != LastVal &&
1560           "Don't support value assigned to multiple locs yet");
1561    LastVal = VA.getValNo();
1562
1563    if (VA.isRegLoc()) {
1564      EVT RegVT = VA.getLocVT();
1565      TargetRegisterClass *RC = NULL;
1566      if (RegVT == MVT::i32)
1567        RC = X86::GR32RegisterClass;
1568      else if (Is64Bit && RegVT == MVT::i64)
1569        RC = X86::GR64RegisterClass;
1570      else if (RegVT == MVT::f32)
1571        RC = X86::FR32RegisterClass;
1572      else if (RegVT == MVT::f64)
1573        RC = X86::FR64RegisterClass;
1574      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
1575        RC = X86::VR128RegisterClass;
1576      else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
1577        RC = X86::VR64RegisterClass;
1578      else
1579        llvm_unreachable("Unknown argument type!");
1580
1581      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1582      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1583
1584      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1585      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1586      // right size.
1587      if (VA.getLocInfo() == CCValAssign::SExt)
1588        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1589                               DAG.getValueType(VA.getValVT()));
1590      else if (VA.getLocInfo() == CCValAssign::ZExt)
1591        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1592                               DAG.getValueType(VA.getValVT()));
1593      else if (VA.getLocInfo() == CCValAssign::BCvt)
1594        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1595
1596      if (VA.isExtInLoc()) {
1597        // Handle MMX values passed in XMM regs.
1598        if (RegVT.isVector()) {
1599          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1600                                 ArgValue, DAG.getConstant(0, MVT::i64));
1601          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1602        } else
1603          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1604      }
1605    } else {
1606      assert(VA.isMemLoc());
1607      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1608    }
1609
1610    // If value is passed via pointer - do a load.
1611    if (VA.getLocInfo() == CCValAssign::Indirect)
1612      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0,
1613                             false, false, 0);
1614
1615    InVals.push_back(ArgValue);
1616  }
1617
1618  // The x86-64 ABI for returning structs by value requires that we copy
1619  // the sret argument into %rax for the return. Save the argument into
1620  // a virtual register so that we can access it from the return points.
1621  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1622    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1623    unsigned Reg = FuncInfo->getSRetReturnReg();
1624    if (!Reg) {
1625      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1626      FuncInfo->setSRetReturnReg(Reg);
1627    }
1628    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1629    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1630  }
1631
1632  unsigned StackSize = CCInfo.getNextStackOffset();
1633  // Align stack specially for tail calls.
1634  if (FuncIsMadeTailCallSafe(CallConv))
1635    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1636
1637  // If the function takes a variable number of arguments, make a frame index for
1638  // the start of the first vararg value... for expansion of llvm.va_start.
1639  if (isVarArg) {
1640    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
1641                    CallConv != CallingConv::X86_ThisCall)) {
1642      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
1643    }
1644    if (Is64Bit) {
1645      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1646
1647      // FIXME: We should really autogenerate these arrays
1648      static const unsigned GPR64ArgRegsWin64[] = {
1649        X86::RCX, X86::RDX, X86::R8,  X86::R9
1650      };
1651      static const unsigned XMMArgRegsWin64[] = {
1652        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1653      };
1654      static const unsigned GPR64ArgRegs64Bit[] = {
1655        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1656      };
1657      static const unsigned XMMArgRegs64Bit[] = {
1658        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1659        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1660      };
1661      const unsigned *GPR64ArgRegs, *XMMArgRegs;
1662
1663      if (IsWin64) {
1664        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1665        GPR64ArgRegs = GPR64ArgRegsWin64;
1666        XMMArgRegs = XMMArgRegsWin64;
1667      } else {
1668        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1669        GPR64ArgRegs = GPR64ArgRegs64Bit;
1670        XMMArgRegs = XMMArgRegs64Bit;
1671      }
1672      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1673                                                       TotalNumIntRegs);
1674      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1675                                                       TotalNumXMMRegs);
1676
1677      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1678      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1679             "SSE register cannot be used when SSE is disabled!");
1680      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1681             "SSE register cannot be used when SSE is disabled!");
1682      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1683        // Kernel mode asks for SSE to be disabled, so don't push them
1684        // on the stack.
1685        TotalNumXMMRegs = 0;
1686
1687      // For X86-64, if there are vararg parameters that are passed via
1688      // registers, then we must store them to their spots on the stack so they
1689      // may be loaded by dereferencing the result of va_arg.
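      // Per the AMD64 ABI va_list layout, the register save area holds
      // TotalNumIntRegs * 8 bytes of GPRs followed by TotalNumXMMRegs * 16
      // bytes of XMM registers; the gp_offset and fp_offset values recorded
      // below index into this area.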
1690      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1691      FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
1692      FuncInfo->setRegSaveFrameIndex(
1693        MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
1694                               false));
1695
1696      // Store the integer parameter registers.
1697      SmallVector<SDValue, 8> MemOps;
1698      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1699                                        getPointerTy());
1700      unsigned Offset = FuncInfo->getVarArgsGPOffset();
1701      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1702        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1703                                  DAG.getIntPtrConstant(Offset));
1704        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1705                                     X86::GR64RegisterClass);
1706        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1707        SDValue Store =
1708          DAG.getStore(Val.getValue(1), dl, Val, FIN,
1709                       PseudoSourceValue::getFixedStack(
1710                         FuncInfo->getRegSaveFrameIndex()),
1711                       Offset, false, false, 0);
1712        MemOps.push_back(Store);
1713        Offset += 8;
1714      }
1715
1716      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
1717        // Now store the XMM (fp + vector) parameter registers.
1718        SmallVector<SDValue, 11> SaveXMMOps;
1719        SaveXMMOps.push_back(Chain);
1720
1721        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
1722        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
1723        SaveXMMOps.push_back(ALVal);
1724
1725        SaveXMMOps.push_back(DAG.getIntPtrConstant(
1726                               FuncInfo->getRegSaveFrameIndex()));
1727        SaveXMMOps.push_back(DAG.getIntPtrConstant(
1728                               FuncInfo->getVarArgsFPOffset()));
1729
1730        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1731          unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
1732                                       X86::VR128RegisterClass);
1733          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
1734          SaveXMMOps.push_back(Val);
1735        }
1736        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
1737                                     MVT::Other,
1738                                     &SaveXMMOps[0], SaveXMMOps.size()));
1739      }
1740
1741      if (!MemOps.empty())
1742        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1743                            &MemOps[0], MemOps.size());
1744    }
1745  }
1746
1747  // Some CCs need callee pop.
1748  if (Subtarget->IsCalleePop(isVarArg, CallConv)) {
1749    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
1750  } else {
1751    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
1752    // If this is an sret function, the return should pop the hidden pointer.
1753    if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
1754      FuncInfo->setBytesToPopOnReturn(4);
1755  }
1756
1757  if (!Is64Bit) {
1758    // RegSaveFrameIndex is X86-64 only.
1759    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1760    if (CallConv == CallingConv::X86_FastCall ||
1761        CallConv == CallingConv::X86_ThisCall)
1762      // fastcall and thiscall functions can't have varargs.
1763      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1764  }
1765
1766  return Chain;
1767}
1768
1769SDValue
1770X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
1771                                    SDValue StackPtr, SDValue Arg,
1772                                    DebugLoc dl, SelectionDAG &DAG,
1773                                    const CCValAssign &VA,
1774                                    ISD::ArgFlagsTy Flags) const {
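  // Win64 reserves 32 bytes of shadow (home) space for the four register
  // parameters, so stack arguments start at offset 32 from the stack pointer.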
1775  const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
1776  unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
1777  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1778  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1779  if (Flags.isByVal()) {
1780    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1781  }
1782  return DAG.getStore(Chain, dl, Arg, PtrOff,
1783                      PseudoSourceValue::getStack(), LocMemOffset,
1784                      false, false, 0);
1785}
1786
1787/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
1788/// optimization is performed and it is required.
1789SDValue
1790X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1791                                           SDValue &OutRetAddr, SDValue Chain,
1792                                           bool IsTailCall, bool Is64Bit,
1793                                           int FPDiff, DebugLoc dl) const {
1794  // Adjust the Return address stack slot.
1795  EVT VT = getPointerTy();
1796  OutRetAddr = getReturnAddressFrameIndex(DAG);
1797
1798  // Load the "old" Return address.
1799  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0);
1800  return SDValue(OutRetAddr.getNode(), 1);
1801}
1802
1803/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1804/// optimization is performed and it is required (FPDiff!=0).
1805static SDValue
1806EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1807                         SDValue Chain, SDValue RetAddrFrIdx,
1808                         bool Is64Bit, int FPDiff, DebugLoc dl) {
1809  // Store the return address to the appropriate stack slot.
1810  if (!FPDiff) return Chain;
1811  // Calculate the new stack slot for the return address.
1812  int SlotSize = Is64Bit ? 8 : 4;
1813  int NewReturnAddrFI =
1814    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
1815  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1816  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1817  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1818                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
1819                       false, false, 0);
1820  return Chain;
1821}
1822
1823SDValue
1824X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1825                             CallingConv::ID CallConv, bool isVarArg,
1826                             bool &isTailCall,
1827                             const SmallVectorImpl<ISD::OutputArg> &Outs,
1828                             const SmallVectorImpl<SDValue> &OutVals,
1829                             const SmallVectorImpl<ISD::InputArg> &Ins,
1830                             DebugLoc dl, SelectionDAG &DAG,
1831                             SmallVectorImpl<SDValue> &InVals) const {
1832  MachineFunction &MF = DAG.getMachineFunction();
1833  bool Is64Bit        = Subtarget->is64Bit();
1834  bool IsStructRet    = CallIsStructReturn(Outs);
1835  bool IsSibcall      = false;
1836
1837  if (isTailCall) {
1838    // Check if it's really possible to do a tail call.
1839    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1840                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1841                                                   Outs, OutVals, Ins, DAG);
1842
1843    // Sibcalls are automatically detected tailcalls which do not require
1844    // ABI changes.
1845    if (!GuaranteedTailCallOpt && isTailCall)
1846      IsSibcall = true;
1847
1848    if (isTailCall)
1849      ++NumTailCalls;
1850  }
1851
1852  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1853         "Var args not supported with calling convention fastcc or ghc");
1854
1855  // Analyze operands of the call, assigning locations to each operand.
1856  SmallVector<CCValAssign, 16> ArgLocs;
1857  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1858                 ArgLocs, *DAG.getContext());
1859  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1860
1861  // Get a count of how many bytes are to be pushed on the stack.
1862  unsigned NumBytes = CCInfo.getNextStackOffset();
1863  if (IsSibcall)
1864    // This is a sibcall. The memory operands are available in the
1865    // caller's own stack.
1866    NumBytes = 0;
1867  else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
1868    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1869
1870  int FPDiff = 0;
1871  if (isTailCall && !IsSibcall) {
1872    // Lower arguments at fp - stackoffset + fpdiff.
1873    unsigned NumBytesCallerPushed =
1874      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1875    FPDiff = NumBytesCallerPushed - NumBytes;
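    // For example, if the caller pops 16 bytes on return but this call needs
    // 32 bytes of arguments, FPDiff is -16 and the return address slot is
    // relocated by 16 bytes to make room (see EmitTailCallStoreRetAddr).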
1876
1877    // Set the delta of movement of the return address stack slot,
1878    // but only if the new delta is greater than the previous one.
1879    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1880      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1881  }
1882
1883  if (!IsSibcall)
1884    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1885
1886  SDValue RetAddrFrIdx;
1887  // Load the return address for tail calls.
1888  if (isTailCall && FPDiff)
1889    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
1890                                    Is64Bit, FPDiff, dl);
1891
1892  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1893  SmallVector<SDValue, 8> MemOpChains;
1894  SDValue StackPtr;
1895
1896  // Walk the register/memloc assignments, inserting copies/loads.  In the case
1897  // of tail call optimization, arguments are handled later.
1898  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1899    CCValAssign &VA = ArgLocs[i];
1900    EVT RegVT = VA.getLocVT();
1901    SDValue Arg = OutVals[i];
1902    ISD::ArgFlagsTy Flags = Outs[i].Flags;
1903    bool isByVal = Flags.isByVal();
1904
1905    // Promote the value if needed.
1906    switch (VA.getLocInfo()) {
1907    default: llvm_unreachable("Unknown loc info!");
1908    case CCValAssign::Full: break;
1909    case CCValAssign::SExt:
1910      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1911      break;
1912    case CCValAssign::ZExt:
1913      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1914      break;
1915    case CCValAssign::AExt:
1916      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1917        // Special case: passing MMX values in XMM registers.
1918        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1919        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1920        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1921      } else
1922        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1923      break;
1924    case CCValAssign::BCvt:
1925      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
1926      break;
1927    case CCValAssign::Indirect: {
1928      // Store the argument.
1929      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
1930      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1931      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
1932                           PseudoSourceValue::getFixedStack(FI), 0,
1933                           false, false, 0);
1934      Arg = SpillSlot;
1935      break;
1936    }
1937    }
1938
1939    if (VA.isRegLoc()) {
1940      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1941    } else if (!IsSibcall && (!isTailCall || isByVal)) {
1942      assert(VA.isMemLoc());
1943      if (StackPtr.getNode() == 0)
1944        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
1945      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1946                                             dl, DAG, VA, Flags));
1947    }
1948  }
1949
1950  if (!MemOpChains.empty())
1951    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1952                        &MemOpChains[0], MemOpChains.size());
1953
1954  // Build a sequence of copy-to-reg nodes chained together with token chain
1955  // and flag operands which copy the outgoing args into registers.
1956  SDValue InFlag;
1957  // Tail call byval lowering might overwrite argument registers so in case of
1958  // tail call optimization the copies to registers are lowered later.
1959  if (!isTailCall)
1960    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1961      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1962                               RegsToPass[i].second, InFlag);
1963      InFlag = Chain.getValue(1);
1964    }
1965
1966  if (Subtarget->isPICStyleGOT()) {
1967    // ELF / PIC requires the GOT pointer to be in the EBX register before
1968    // function calls made via the PLT.
1969    if (!isTailCall) {
1970      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
1971                               DAG.getNode(X86ISD::GlobalBaseReg,
1972                                           DebugLoc(), getPointerTy()),
1973                               InFlag);
1974      InFlag = Chain.getValue(1);
1975    } else {
1976      // If we are tail calling and generating PIC/GOT style code load the
1977      // address of the callee into ECX. The value in ecx is used as target of
1978      // the tail jump. This is done to circumvent the ebx/callee-saved problem
1979      // for tail calls on PIC/GOT architectures. Normally we would just put the
1980      // address of GOT into ebx and then call target@PLT. But for tail calls
1981      // ebx would be restored (since ebx is callee saved) before jumping to the
1982      // target@PLT.
1983
1984      // Note: The actual moving to ECX is done further down.
1985      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1986      if (G && !G->getGlobal()->hasHiddenVisibility() &&
1987          !G->getGlobal()->hasProtectedVisibility())
1988        Callee = LowerGlobalAddress(Callee, DAG);
1989      else if (isa<ExternalSymbolSDNode>(Callee))
1990        Callee = LowerExternalSymbol(Callee, DAG);
1991    }
1992  }
1993
1994  if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) {
1995    // From AMD64 ABI document:
1996    // For calls that may call functions that use varargs or stdargs
1997    // (prototype-less calls or calls to functions containing ellipsis (...) in
1998    // the declaration) %al is used as hidden argument to specify the number
1999    // of SSE registers used. The contents of %al do not need to match exactly
2000    // the number of registers, but must be an upper bound on the number of SSE
2001    // registers used and must be in the range 0 - 8 inclusive.
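    // For example, a varargs call that passes a single double in XMM0 is
    // emitted with %al set to 1 before the call instruction.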
2002
2003    // Count the number of XMM registers allocated.
2004    static const unsigned XMMArgRegs[] = {
2005      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2006      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2007    };
2008    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2009    assert((Subtarget->hasSSE1() || !NumXMMRegs)
2010           && "SSE registers cannot be used when SSE is disabled");
2011
2012    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
2013                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
2014    InFlag = Chain.getValue(1);
2015  }
2016
2017
2018  // For tail calls lower the arguments to the 'real' stack slot.
2019  if (isTailCall) {
2020    // Force all the incoming stack arguments to be loaded from the stack
2021    // before any new outgoing arguments are stored to the stack, because the
2022    // outgoing stack slots may alias the incoming argument stack slots, and
2023    // the alias isn't otherwise explicit. This is slightly more conservative
2024    // than necessary, because it means that each store effectively depends
2025    // on every argument instead of just those arguments it would clobber.
2026    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2027
2028    SmallVector<SDValue, 8> MemOpChains2;
2029    SDValue FIN;
2030    int FI = 0;
2031    // Do not flag the preceding CopyToReg nodes together with the following nodes.
2032    InFlag = SDValue();
2033    if (GuaranteedTailCallOpt) {
2034      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2035        CCValAssign &VA = ArgLocs[i];
2036        if (VA.isRegLoc())
2037          continue;
2038        assert(VA.isMemLoc());
2039        SDValue Arg = OutVals[i];
2040        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2041        // Create frame index.
2042        int32_t Offset = VA.getLocMemOffset()+FPDiff;
2043        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2044        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2045        FIN = DAG.getFrameIndex(FI, getPointerTy());
2046
2047        if (Flags.isByVal()) {
2048          // Copy relative to framepointer.
2049          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2050          if (StackPtr.getNode() == 0)
2051            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2052                                          getPointerTy());
2053          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2054
2055          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2056                                                           ArgChain,
2057                                                           Flags, DAG, dl));
2058        } else {
2059          // Store relative to framepointer.
2060          MemOpChains2.push_back(
2061            DAG.getStore(ArgChain, dl, Arg, FIN,
2062                         PseudoSourceValue::getFixedStack(FI), 0,
2063                         false, false, 0));
2064        }
2065      }
2066    }
2067
2068    if (!MemOpChains2.empty())
2069      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2070                          &MemOpChains2[0], MemOpChains2.size());
2071
2072    // Copy arguments to their registers.
2073    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2074      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2075                               RegsToPass[i].second, InFlag);
2076      InFlag = Chain.getValue(1);
2077    }
2078    InFlag = SDValue();
2079
2080    // Store the return address to the appropriate stack slot.
2081    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2082                                     FPDiff, dl);
2083  }
2084
2085  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2086    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2087    // In the 64-bit large code model, we have to make all calls
2088    // through a register, since the call instruction's 32-bit
2089    // pc-relative offset may not be large enough to hold the whole
2090    // address.
2091  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2092    // If the callee is a GlobalAddress node (quite common, every direct call
2093    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2094    // it.
2095
2096    // We should use extra load for direct calls to dllimported functions in
2097    // non-JIT mode.
2098    const GlobalValue *GV = G->getGlobal();
2099    if (!GV->hasDLLImportLinkage()) {
2100      unsigned char OpFlags = 0;
2101
2102      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2103      // external symbols must go through the PLT in PIC mode.  If the symbol
2104      // has hidden or protected visibility, or if it is static or local, then
2105      // we don't need to use the PLT - we can directly call it.
2106      if (Subtarget->isTargetELF() &&
2107          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2108          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2109        OpFlags = X86II::MO_PLT;
2110      } else if (Subtarget->isPICStyleStubAny() &&
2111               (GV->isDeclaration() || GV->isWeakForLinker()) &&
2112               Subtarget->getDarwinVers() < 9) {
2113        // PC-relative references to external symbols should go through $stub,
2114        // unless we're building with the leopard linker or later, which
2115        // automatically synthesizes these stubs.
2116        OpFlags = X86II::MO_DARWIN_STUB;
2117      }
2118
2119      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2120                                          G->getOffset(), OpFlags);
2121    }
2122  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2123    unsigned char OpFlags = 0;
2124
2125    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
2126    // symbols should go through the PLT.
2127    if (Subtarget->isTargetELF() &&
2128        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2129      OpFlags = X86II::MO_PLT;
2130    } else if (Subtarget->isPICStyleStubAny() &&
2131             Subtarget->getDarwinVers() < 9) {
2132      // PC-relative references to external symbols should go through $stub,
2133      // unless we're building with the leopard linker or later, which
2134      // automatically synthesizes these stubs.
2135      OpFlags = X86II::MO_DARWIN_STUB;
2136    }
2137
2138    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2139                                         OpFlags);
2140  }
2141
2142  // Returns a chain & a flag for retval copy to use.
2143  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
2144  SmallVector<SDValue, 8> Ops;
2145
2146  if (!IsSibcall && isTailCall) {
2147    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2148                           DAG.getIntPtrConstant(0, true), InFlag);
2149    InFlag = Chain.getValue(1);
2150  }
2151
2152  Ops.push_back(Chain);
2153  Ops.push_back(Callee);
2154
2155  if (isTailCall)
2156    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2157
2158  // Add argument registers to the end of the list so that they are known live
2159  // into the call.
2160  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2161    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2162                                  RegsToPass[i].second.getValueType()));
2163
2164  // Add an implicit use GOT pointer in EBX.
2165  if (!isTailCall && Subtarget->isPICStyleGOT())
2166    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2167
2168  // Add an implicit use of AL for x86 vararg functions.
2169  if (Is64Bit && isVarArg)
2170    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2171
2172  if (InFlag.getNode())
2173    Ops.push_back(InFlag);
2174
2175  if (isTailCall) {
2176    // We used to do:
2177    //// If this is the first return lowered for this function, add the regs
2178    //// to the liveout set for the function.
2179    // This isn't right, although it's probably harmless on x86; liveouts
2180    // should be computed from returns not tail calls.  Consider a void
2181    // function making a tail call to a function returning int.
2182    return DAG.getNode(X86ISD::TC_RETURN, dl,
2183                       NodeTys, &Ops[0], Ops.size());
2184  }
2185
2186  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2187  InFlag = Chain.getValue(1);
2188
2189  // Create the CALLSEQ_END node.
2190  unsigned NumBytesForCalleeToPush;
2191  if (Subtarget->IsCalleePop(isVarArg, CallConv))
2192    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2193  else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2194    // If this is a call to a struct-return function, the callee
2195    // pops the hidden struct pointer, so we have to push it back.
2196    // This is common for Darwin/X86, Linux & Mingw32 targets.
2197    NumBytesForCalleeToPush = 4;
2198  else
2199    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2200
2201  // Returns a flag for retval copy to use.
2202  if (!IsSibcall) {
2203    Chain = DAG.getCALLSEQ_END(Chain,
2204                               DAG.getIntPtrConstant(NumBytes, true),
2205                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2206                                                     true),
2207                               InFlag);
2208    InFlag = Chain.getValue(1);
2209  }
2210
2211  // Handle result values, copying them out of physregs into vregs that we
2212  // return.
2213  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2214                         Ins, dl, DAG, InVals);
2215}
2216
2217
2218//===----------------------------------------------------------------------===//
2219//                Fast Calling Convention (tail call) implementation
2220//===----------------------------------------------------------------------===//
2221
2222//  Like StdCall, the callee cleans up the arguments, except that ECX is
2223//  reserved for storing the tail-called function's address. Only 2 registers are
2224//  free for argument passing (inreg). Tail call optimization is performed
2225//  provided:
2226//                * tailcallopt is enabled
2227//                * caller/callee are fastcc
2228//  On X86_64 architecture with GOT-style position independent code only local
2229//  (within module) calls are supported at the moment.
2230//  To keep the stack aligned according to the platform ABI, the function
2231//  GetAlignedArgumentStackSize ensures that the argument delta is always a
2232//  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
2233//  for example.) If a tail-called callee has more arguments than the caller,
2234//  the caller needs to make sure that there is room to move the RETADDR to.
2235//  This is achieved by reserving an area the size of the argument delta right
2236//  after the original RETADDR, but before the saved frame pointer or the spilled registers
2237//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
2238//  stack layout:
2239//    arg1
2240//    arg2
2241//    RETADDR
2242//    [ new RETADDR
2243//      move area ]
2244//    (possible EBP)
2245//    ESI
2246//    EDI
2247//    local1 ..
2248
2249/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for a
2250/// 16-byte alignment requirement.
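/// For example, on x86-32 with 16-byte stack alignment and a 4-byte slot size,
/// a 20-byte argument area is padded to 28 bytes so that pushing the 4-byte
/// return address leaves the stack 16-byte aligned again.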
2251unsigned
2252X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2253                                               SelectionDAG& DAG) const {
2254  MachineFunction &MF = DAG.getMachineFunction();
2255  const TargetMachine &TM = MF.getTarget();
2256  const TargetFrameInfo &TFI = *TM.getFrameInfo();
2257  unsigned StackAlignment = TFI.getStackAlignment();
2258  uint64_t AlignMask = StackAlignment - 1;
2259  int64_t Offset = StackSize;
2260  uint64_t SlotSize = TD->getPointerSize();
2261  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2262    // Number smaller than 12 so just add the difference.
2263    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2264  } else {
2265    // Mask out lower bits, add stackalignment once plus the 12 bytes.
2266    Offset = ((~AlignMask) & Offset) + StackAlignment +
2267      (StackAlignment-SlotSize);
2268  }
2269  return Offset;
2270}
2271
2272/// MatchingStackOffset - Return true if the given stack call argument is
2273/// already available at the same (relative) position in the caller's
2274/// incoming argument stack.
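/// For example, when a function forwards one of its own fixed incoming stack
/// arguments to a tail-called callee at the same offset and size, the value is
/// already where the callee expects it and needs no store.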
2275static
2276bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2277                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2278                         const X86InstrInfo *TII) {
2279  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2280  int FI = INT_MAX;
2281  if (Arg.getOpcode() == ISD::CopyFromReg) {
2282    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2283    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
2284      return false;
2285    MachineInstr *Def = MRI->getVRegDef(VR);
2286    if (!Def)
2287      return false;
2288    if (!Flags.isByVal()) {
2289      if (!TII->isLoadFromStackSlot(Def, FI))
2290        return false;
2291    } else {
2292      unsigned Opcode = Def->getOpcode();
2293      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2294          Def->getOperand(1).isFI()) {
2295        FI = Def->getOperand(1).getIndex();
2296        Bytes = Flags.getByValSize();
2297      } else
2298        return false;
2299    }
2300  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2301    if (Flags.isByVal())
2302      // ByVal argument is passed in as a pointer but it's now being
2303      // dereferenced. e.g.
2304      // define @foo(%struct.X* %A) {
2305      //   tail call @bar(%struct.X* byval %A)
2306      // }
2307      return false;
2308    SDValue Ptr = Ld->getBasePtr();
2309    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2310    if (!FINode)
2311      return false;
2312    FI = FINode->getIndex();
2313  } else
2314    return false;
2315
2316  assert(FI != INT_MAX);
2317  if (!MFI->isFixedObjectIndex(FI))
2318    return false;
2319  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2320}
2321
2322/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2323/// for tail call optimization. Targets which want to do tail call
2324/// optimization should implement this function.
2325bool
2326X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2327                                                     CallingConv::ID CalleeCC,
2328                                                     bool isVarArg,
2329                                                     bool isCalleeStructRet,
2330                                                     bool isCallerStructRet,
2331                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
2332                                    const SmallVectorImpl<SDValue> &OutVals,
2333                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2334                                                     SelectionDAG& DAG) const {
2335  if (!IsTailCallConvention(CalleeCC) &&
2336      CalleeCC != CallingConv::C)
2337    return false;
2338
2339  // If -tailcallopt is specified, make fastcc functions tail-callable.
2340  const MachineFunction &MF = DAG.getMachineFunction();
2341  const Function *CallerF = DAG.getMachineFunction().getFunction();
2342  CallingConv::ID CallerCC = CallerF->getCallingConv();
2343  bool CCMatch = CallerCC == CalleeCC;
2344
2345  if (GuaranteedTailCallOpt) {
2346    if (IsTailCallConvention(CalleeCC) && CCMatch)
2347      return true;
2348    return false;
2349  }
2350
2351  // Look for obvious safe cases to perform tail call optimization that do not
2352  // require ABI changes. This is what gcc calls sibcall.
2353
2354  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2355  // emit a special epilogue.
2356  if (RegInfo->needsStackRealignment(MF))
2357    return false;
2358
2359  // Do not sibcall optimize vararg calls unless the call site passes no
2360  // arguments.
2361  if (isVarArg && !Outs.empty())
2362    return false;
2363
2364  // Also avoid sibcall optimization if either caller or callee uses struct
2365  // return semantics.
2366  if (isCalleeStructRet || isCallerStructRet)
2367    return false;
2368
2369  // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
2370  // Therefore, if the result is not used, it is not safe to optimize this into
2371  // a sibcall.
2372  bool Unused = false;
2373  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
2374    if (!Ins[i].Used) {
2375      Unused = true;
2376      break;
2377    }
2378  }
2379  if (Unused) {
2380    SmallVector<CCValAssign, 16> RVLocs;
2381    CCState CCInfo(CalleeCC, false, getTargetMachine(),
2382                   RVLocs, *DAG.getContext());
2383    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2384    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2385      CCValAssign &VA = RVLocs[i];
2386      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
2387        return false;
2388    }
2389  }
2390
2391  // If the calling conventions do not match, then we'd better make sure the
2392  // results are returned in the same way the caller expects.
2393  if (!CCMatch) {
2394    SmallVector<CCValAssign, 16> RVLocs1;
2395    CCState CCInfo1(CalleeCC, false, getTargetMachine(),
2396                    RVLocs1, *DAG.getContext());
2397    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
2398
2399    SmallVector<CCValAssign, 16> RVLocs2;
2400    CCState CCInfo2(CallerCC, false, getTargetMachine(),
2401                    RVLocs2, *DAG.getContext());
2402    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
2403
2404    if (RVLocs1.size() != RVLocs2.size())
2405      return false;
2406    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2407      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2408        return false;
2409      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2410        return false;
2411      if (RVLocs1[i].isRegLoc()) {
2412        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2413          return false;
2414      } else {
2415        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2416          return false;
2417      }
2418    }
2419  }
2420
2421  // If the callee takes no arguments then go on to check the results of the
2422  // call.
2423  if (!Outs.empty()) {
2424    // Check if stack adjustment is needed. For now, do not do this if any
2425    // argument is passed on the stack.
2426    SmallVector<CCValAssign, 16> ArgLocs;
2427    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
2428                   ArgLocs, *DAG.getContext());
2429    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
2430    if (CCInfo.getNextStackOffset()) {
2431      MachineFunction &MF = DAG.getMachineFunction();
2432      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2433        return false;
2434      if (Subtarget->isTargetWin64())
2435        // Win64 ABI has additional complications.
2436        return false;
2437
2438      // Check if the arguments are already laid out in the same way as
2439      // the caller's fixed stack objects.
2440      MachineFrameInfo *MFI = MF.getFrameInfo();
2441      const MachineRegisterInfo *MRI = &MF.getRegInfo();
2442      const X86InstrInfo *TII =
2443        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
2444      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2445        CCValAssign &VA = ArgLocs[i];
2446        SDValue Arg = OutVals[i];
2447        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2448        if (VA.getLocInfo() == CCValAssign::Indirect)
2449          return false;
2450        if (!VA.isRegLoc()) {
2451          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2452                                   MFI, MRI, TII))
2453            return false;
2454        }
2455      }
2456    }
2457
2458    // If the tailcall address may be in a register, then make sure it's
2459    // possible to register allocate for it. In 32-bit, the call address can
2460    // only target EAX, EDX, or ECX since the tail call must be scheduled after
2461    // callee-saved registers are restored. These happen to be the same
2462    // registers used to pass 'inreg' arguments so watch out for those.
2463    if (!Subtarget->is64Bit() &&
2464        !isa<GlobalAddressSDNode>(Callee) &&
2465        !isa<ExternalSymbolSDNode>(Callee)) {
2466      unsigned NumInRegs = 0;
2467      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2468        CCValAssign &VA = ArgLocs[i];
2469        if (!VA.isRegLoc())
2470          continue;
2471        unsigned Reg = VA.getLocReg();
2472        switch (Reg) {
2473        default: break;
2474        case X86::EAX: case X86::EDX: case X86::ECX:
2475          if (++NumInRegs == 3)
2476            return false;
2477          break;
2478        }
2479      }
2480    }
2481  }
2482
2483  return true;
2484}
2485
2486FastISel *
2487X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
2488  return X86::createFastISel(funcInfo);
2489}
2490
2491
2492//===----------------------------------------------------------------------===//
2493//                           Other Lowering Hooks
2494//===----------------------------------------------------------------------===//
2495
2496
2497SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2498  MachineFunction &MF = DAG.getMachineFunction();
2499  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2500  int ReturnAddrIndex = FuncInfo->getRAIndex();
2501
2502  if (ReturnAddrIndex == 0) {
2503    // Set up a frame object for the return address.
2504    uint64_t SlotSize = TD->getPointerSize();
2505    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2506                                                           false);
2507    FuncInfo->setRAIndex(ReturnAddrIndex);
2508  }
2509
2510  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2511}
2512
2513
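/// isOffsetSuitableForCodeModel - Returns true if a constant displacement of
/// this size is safe to fold into an X86 addressing mode under the given code
/// model. For example, with a symbolic displacement in the small code model an
/// offset of 4MB is accepted while one of 32MB is not, and in the kernel code
/// model only positive offsets are accepted.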
2514bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2515                                       bool hasSymbolicDisplacement) {
2516  // Offset should fit into 32 bit immediate field.
2517  if (!isInt<32>(Offset))
2518    return false;
2519
2520  // If we don't have a symbolic displacement - we don't have any extra
2521  // restrictions.
2522  if (!hasSymbolicDisplacement)
2523    return true;
2524
2525  // FIXME: Some tweaks might be needed for medium code model.
2526  if (M != CodeModel::Small && M != CodeModel::Kernel)
2527    return false;
2528
2529  // For the small code model we assume that the last object is 16MB before the
2530  // end of the 31-bit boundary. We can also accept pretty large negative offsets,
2531  // knowing that all objects lie in the positive half of the address space.
2532  if (M == CodeModel::Small && Offset < 16*1024*1024)
2533    return true;
2534
2535  // For the kernel code model we know that all objects reside in the negative half
2536  // of the 32-bit address space. We must not accept negative offsets, since they may
2537  // land just outside an object, but we can accept pretty large positive ones.
2538  if (M == CodeModel::Kernel && Offset > 0)
2539    return true;
2540
2541  return false;
2542}
2543
2544/// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the X86
2545/// specific condition code, returning the condition code and the LHS/RHS of the
2546/// comparison to make.
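/// For example, an integer SETGT compare against the constant -1 is rewritten
/// as a compare against 0 and mapped to X86::COND_NS (branch when the sign
/// flag is clear).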
2547static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2548                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2549  if (!isFP) {
2550    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2551      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2552        // X > -1   -> X == 0, jump !sign.
2553        RHS = DAG.getConstant(0, RHS.getValueType());
2554        return X86::COND_NS;
2555      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2556        // X < 0   -> X == 0, jump on sign.
2557        return X86::COND_S;
2558      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2559        // X < 1   -> X <= 0
2560        RHS = DAG.getConstant(0, RHS.getValueType());
2561        return X86::COND_LE;
2562      }
2563    }
2564
2565    switch (SetCCOpcode) {
2566    default: llvm_unreachable("Invalid integer condition!");
2567    case ISD::SETEQ:  return X86::COND_E;
2568    case ISD::SETGT:  return X86::COND_G;
2569    case ISD::SETGE:  return X86::COND_GE;
2570    case ISD::SETLT:  return X86::COND_L;
2571    case ISD::SETLE:  return X86::COND_LE;
2572    case ISD::SETNE:  return X86::COND_NE;
2573    case ISD::SETULT: return X86::COND_B;
2574    case ISD::SETUGT: return X86::COND_A;
2575    case ISD::SETULE: return X86::COND_BE;
2576    case ISD::SETUGE: return X86::COND_AE;
2577    }
2578  }
2579
2580  // First determine if it is required or is profitable to flip the operands.
2581
2582  // If LHS is a foldable load, but RHS is not, flip the condition.
2583  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2584      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2585    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2586    std::swap(LHS, RHS);
2587  }
2588
2589  switch (SetCCOpcode) {
2590  default: break;
2591  case ISD::SETOLT:
2592  case ISD::SETOLE:
2593  case ISD::SETUGT:
2594  case ISD::SETUGE:
2595    std::swap(LHS, RHS);
2596    break;
2597  }
2598
2599  // On a floating point condition, the flags are set as follows:
2600  // ZF  PF  CF   op
2601  //  0 | 0 | 0 | X > Y
2602  //  0 | 0 | 1 | X < Y
2603  //  1 | 0 | 0 | X == Y
2604  //  1 | 1 | 1 | unordered
2605  switch (SetCCOpcode) {
2606  default: llvm_unreachable("Condcode should be pre-legalized away");
2607  case ISD::SETUEQ:
2608  case ISD::SETEQ:   return X86::COND_E;
2609  case ISD::SETOLT:              // flipped
2610  case ISD::SETOGT:
2611  case ISD::SETGT:   return X86::COND_A;
2612  case ISD::SETOLE:              // flipped
2613  case ISD::SETOGE:
2614  case ISD::SETGE:   return X86::COND_AE;
2615  case ISD::SETUGT:              // flipped
2616  case ISD::SETULT:
2617  case ISD::SETLT:   return X86::COND_B;
2618  case ISD::SETUGE:              // flipped
2619  case ISD::SETULE:
2620  case ISD::SETLE:   return X86::COND_BE;
2621  case ISD::SETONE:
2622  case ISD::SETNE:   return X86::COND_NE;
2623  case ISD::SETUO:   return X86::COND_P;
2624  case ISD::SETO:    return X86::COND_NP;
2625  case ISD::SETOEQ:
2626  case ISD::SETUNE:  return X86::COND_INVALID;
2627  }
2628}
2629
2630/// hasFPCMov - is there a floating point cmov for the specific X86 condition
2631/// code. Current x86 isa includes the following FP cmov instructions:
2632/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2633static bool hasFPCMov(unsigned X86CC) {
2634  switch (X86CC) {
2635  default:
2636    return false;
2637  case X86::COND_B:
2638  case X86::COND_BE:
2639  case X86::COND_E:
2640  case X86::COND_P:
2641  case X86::COND_A:
2642  case X86::COND_AE:
2643  case X86::COND_NE:
2644  case X86::COND_NP:
2645    return true;
2646  }
2647}
2648
2649/// isFPImmLegal - Returns true if the target can instruction select the
2650/// specified FP immediate natively. If false, the legalizer will
2651/// materialize the FP immediate as a load from a constant pool.
2652bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2653  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2654    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2655      return true;
2656  }
2657  return false;
2658}
2659
2660/// isUndefOrInRange - Return true if Val is undef or if its value falls within
2661/// the specified range [Low, Hi).
2662static bool isUndefOrInRange(int Val, int Low, int Hi) {
2663  return (Val < 0) || (Val >= Low && Val < Hi);
2664}
2665
2666/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2667/// specified value.
2668static bool isUndefOrEqual(int Val, int CmpVal) {
2669  if (Val < 0 || Val == CmpVal)
2670    return true;
2671  return false;
2672}
2673
2674/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2675/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
2676/// the second operand.
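/// For example, <2, 1, undef, 0> is a valid v4i32 PSHUFD mask, while
/// <0, 4, 1, 5> is not because elements 4 and 5 reference the second operand.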
2677static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2678  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
2679    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2680  if (VT == MVT::v2f64 || VT == MVT::v2i64)
2681    return (Mask[0] < 2 && Mask[1] < 2);
2682  return false;
2683}
2684
2685bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
2686  SmallVector<int, 8> M;
2687  N->getMask(M);
2688  return ::isPSHUFDMask(M, N->getValueType(0));
2689}
2690
2691/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
2692/// is suitable for input to PSHUFHW.
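/// The lower four elements must be <0, 1, 2, 3> (or undef) and the upper four
/// may reference any of elements 4..7, e.g. <0, 1, 2, 3, 7, 6, 5, 4>.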
2693static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2694  if (VT != MVT::v8i16)
2695    return false;
2696
2697  // Lower quadword copied in order or undef.
2698  for (int i = 0; i != 4; ++i)
2699    if (Mask[i] >= 0 && Mask[i] != i)
2700      return false;
2701
2702  // Upper quadword shuffled.
2703  for (int i = 4; i != 8; ++i)
2704    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2705      return false;
2706
2707  return true;
2708}
2709
2710bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
2711  SmallVector<int, 8> M;
2712  N->getMask(M);
2713  return ::isPSHUFHWMask(M, N->getValueType(0));
2714}
2715
2716/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
2717/// is suitable for input to PSHUFLW.
2718static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2719  if (VT != MVT::v8i16)
2720    return false;
2721
2722  // Upper quadword copied in order.
2723  for (int i = 4; i != 8; ++i)
2724    if (Mask[i] >= 0 && Mask[i] != i)
2725      return false;
2726
2727  // Lower quadword shuffled.
2728  for (int i = 0; i != 4; ++i)
2729    if (Mask[i] >= 4)
2730      return false;
2731
2732  return true;
2733}
2734
2735bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
2736  SmallVector<int, 8> M;
2737  N->getMask(M);
2738  return ::isPSHUFLWMask(M, N->getValueType(0));
2739}
2740
2741/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
2742/// is suitable for input to PALIGNR.
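/// For example, the v8i16 mask <1, 2, 3, 4, 5, 6, 7, 8> selects consecutive
/// elements starting one position into the concatenated inputs, and
/// getShufflePALIGNRImmediate returns a byte immediate of 2 for it.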
2743static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
2744                          bool hasSSSE3) {
2745  int i, e = VT.getVectorNumElements();
2746
2747  // Do not handle v2i64 / v2f64 shuffles with palignr.
2748  if (e < 4 || !hasSSSE3)
2749    return false;
2750
2751  for (i = 0; i != e; ++i)
2752    if (Mask[i] >= 0)
2753      break;
2754
2755  // All undef, not a palignr.
2756  if (i == e)
2757    return false;
2758
2759  // Determine if it's ok to perform a palignr with only the LHS, since we
2760  // don't have access to the actual shuffle elements to see if RHS is undef.
2761  bool Unary = Mask[i] < (int)e;
2762  bool NeedsUnary = false;
2763
2764  int s = Mask[i] - i;
2765
2766  // Check the rest of the elements to see if they are consecutive.
2767  for (++i; i != e; ++i) {
2768    int m = Mask[i];
2769    if (m < 0)
2770      continue;
2771
2772    Unary = Unary && (m < (int)e);
2773    NeedsUnary = NeedsUnary || (m < s);
2774
2775    if (NeedsUnary && !Unary)
2776      return false;
2777    if (Unary && m != ((s+i) & (e-1)))
2778      return false;
2779    if (!Unary && m != (s+i))
2780      return false;
2781  }
2782  return true;
2783}
2784
2785bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
2786  SmallVector<int, 8> M;
2787  N->getMask(M);
2788  return ::isPALIGNRMask(M, N->getValueType(0), true);
2789}
2790
2791/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2792/// specifies a shuffle of elements that is suitable for input to SHUFP*.
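/// For v4f32 the low two result elements must come from the first vector and
/// the high two from the second, e.g. <3, 2, 5, 4> is valid but <4, 1, 2, 3>
/// is not.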
2793static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2794  int NumElems = VT.getVectorNumElements();
2795  if (NumElems != 2 && NumElems != 4)
2796    return false;
2797
2798  int Half = NumElems / 2;
2799  for (int i = 0; i < Half; ++i)
2800    if (!isUndefOrInRange(Mask[i], 0, NumElems))
2801      return false;
2802  for (int i = Half; i < NumElems; ++i)
2803    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2804      return false;
2805
2806  return true;
2807}
2808
2809bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
2810  SmallVector<int, 8> M;
2811  N->getMask(M);
2812  return ::isSHUFPMask(M, N->getValueType(0));
2813}
2814
2815/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
2816/// the reverse of what x86 shuffles want. x86 shuffles require the lower
2817/// half elements to come from vector 1 (which would equal the dest.) and
2818/// the upper half to come from vector 2.
2819static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2820  int NumElems = VT.getVectorNumElements();
2821
2822  if (NumElems != 2 && NumElems != 4)
2823    return false;
2824
2825  int Half = NumElems / 2;
2826  for (int i = 0; i < Half; ++i)
2827    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2828      return false;
2829  for (int i = Half; i < NumElems; ++i)
2830    if (!isUndefOrInRange(Mask[i], 0, NumElems))
2831      return false;
2832  return true;
2833}
2834
2835static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
2836  SmallVector<int, 8> M;
2837  N->getMask(M);
2838  return isCommutedSHUFPMask(M, N->getValueType(0));
2839}
2840
2841/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
2842/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
2843bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
2844  if (N->getValueType(0).getVectorNumElements() != 4)
2845    return false;
2846
2847  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
2848  return isUndefOrEqual(N->getMaskElt(0), 6) &&
2849         isUndefOrEqual(N->getMaskElt(1), 7) &&
2850         isUndefOrEqual(N->getMaskElt(2), 2) &&
2851         isUndefOrEqual(N->getMaskElt(3), 3);
2852}
2853
2854/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
2855/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
2856/// <2, 3, 2, 3>
2857bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
2858  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2859
2860  if (NumElems != 4)
2861    return false;
2862
2863  return isUndefOrEqual(N->getMaskElt(0), 2) &&
2864         isUndefOrEqual(N->getMaskElt(1), 3) &&
2865         isUndefOrEqual(N->getMaskElt(2), 2) &&
2866         isUndefOrEqual(N->getMaskElt(3), 3);
2867}
2868
2869/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
2870/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
2871bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
2872  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2873
2874  if (NumElems != 2 && NumElems != 4)
2875    return false;
2876
2877  for (unsigned i = 0; i < NumElems/2; ++i)
2878    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
2879      return false;
2880
2881  for (unsigned i = NumElems/2; i < NumElems; ++i)
2882    if (!isUndefOrEqual(N->getMaskElt(i), i))
2883      return false;
2884
2885  return true;
2886}
2887
2888/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
2889/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
2890bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
2891  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2892
2893  if (NumElems != 2 && NumElems != 4)
2894    return false;
2895
2896  for (unsigned i = 0; i < NumElems/2; ++i)
2897    if (!isUndefOrEqual(N->getMaskElt(i), i))
2898      return false;
2899
2900  for (unsigned i = 0; i < NumElems/2; ++i)
2901    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
2902      return false;
2903
2904  return true;
2905}
2906
2907/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
2908/// specifies a shuffle of elements that is suitable for input to UNPCKL.
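/// For v4i32 that is the interleaving mask <0, 4, 1, 5>; with V2IsSplat the
/// odd elements must all reference element 0 of V2, i.e. <0, 4, 1, 4>.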
2909static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
2910                         bool V2IsSplat = false) {
2911  int NumElts = VT.getVectorNumElements();
2912  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2913    return false;
2914
2915  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2916    int BitI  = Mask[i];
2917    int BitI1 = Mask[i+1];
2918    if (!isUndefOrEqual(BitI, j))
2919      return false;
2920    if (V2IsSplat) {
2921      if (!isUndefOrEqual(BitI1, NumElts))
2922        return false;
2923    } else {
2924      if (!isUndefOrEqual(BitI1, j + NumElts))
2925        return false;
2926    }
2927  }
2928  return true;
2929}
2930
2931bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2932  SmallVector<int, 8> M;
2933  N->getMask(M);
2934  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
2935}
2936
2937/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
2938/// specifies a shuffle of elements that is suitable for input to UNPCKH.
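/// For v4i32 that is the interleaving mask <2, 6, 3, 7>.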
2939static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
2940                         bool V2IsSplat = false) {
2941  int NumElts = VT.getVectorNumElements();
2942  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2943    return false;
2944
2945  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2946    int BitI  = Mask[i];
2947    int BitI1 = Mask[i+1];
2948    if (!isUndefOrEqual(BitI, j + NumElts/2))
2949      return false;
2950    if (V2IsSplat) {
2951      if (isUndefOrEqual(BitI1, NumElts))
2952        return false;
2953    } else {
2954      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
2955        return false;
2956    }
2957  }
2958  return true;
2959}
2960
2961bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2962  SmallVector<int, 8> M;
2963  N->getMask(M);
2964  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
2965}
2966
2967/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
2968/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
2969/// <0, 0, 1, 1>
2970static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2971  int NumElems = VT.getVectorNumElements();
2972  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2973    return false;
2974
2975  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
2976    int BitI  = Mask[i];
2977    int BitI1 = Mask[i+1];
2978    if (!isUndefOrEqual(BitI, j))
2979      return false;
2980    if (!isUndefOrEqual(BitI1, j))
2981      return false;
2982  }
2983  return true;
2984}
2985
2986bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
2987  SmallVector<int, 8> M;
2988  N->getMask(M);
2989  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
2990}
2991
2992/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
2993/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
2994/// <2, 2, 3, 3>
2995static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2996  int NumElems = VT.getVectorNumElements();
2997  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2998    return false;
2999
3000  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
3001    int BitI  = Mask[i];
3002    int BitI1 = Mask[i+1];
3003    if (!isUndefOrEqual(BitI, j))
3004      return false;
3005    if (!isUndefOrEqual(BitI1, j))
3006      return false;
3007  }
3008  return true;
3009}
3010
3011bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
3012  SmallVector<int, 8> M;
3013  N->getMask(M);
3014  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
3015}
3016
3017/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3018/// specifies a shuffle of elements that is suitable for input to MOVSS,
3019/// MOVSD, and MOVD, i.e. setting the lowest element.
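/// For v4i32 the canonical mask is <4, 1, 2, 3>: element 0 comes from the
/// second vector and the remaining elements come from the first in order.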
3020static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3021  if (VT.getVectorElementType().getSizeInBits() < 32)
3022    return false;
3023
3024  int NumElts = VT.getVectorNumElements();
3025
3026  if (!isUndefOrEqual(Mask[0], NumElts))
3027    return false;
3028
3029  for (int i = 1; i < NumElts; ++i)
3030    if (!isUndefOrEqual(Mask[i], i))
3031      return false;
3032
3033  return true;
3034}
3035
3036bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
3037  SmallVector<int, 8> M;
3038  N->getMask(M);
3039  return ::isMOVLMask(M, N->getValueType(0));
3040}
3041
3042/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
3043/// of what x86 movss wants. X86 movss requires the lowest element to be the
3044/// lowest element of vector 2 and the other elements to come from vector 1 in order.
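/// For v4i32 the canonical commuted mask is <0, 5, 6, 7>.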
3045static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
3046                               bool V2IsSplat = false, bool V2IsUndef = false) {
3047  int NumOps = VT.getVectorNumElements();
3048  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3049    return false;
3050
3051  if (!isUndefOrEqual(Mask[0], 0))
3052    return false;
3053
3054  for (int i = 1; i < NumOps; ++i)
3055    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3056          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3057          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3058      return false;
3059
3060  return true;
3061}
3062
3063static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
3064                           bool V2IsUndef = false) {
3065  SmallVector<int, 8> M;
3066  N->getMask(M);
3067  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
3068}
3069
3070/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3071/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3072bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
3073  if (N->getValueType(0).getVectorNumElements() != 4)
3074    return false;
3075
3076  // Expect 1, 1, 3, 3
3077  for (unsigned i = 0; i < 2; ++i) {
3078    int Elt = N->getMaskElt(i);
3079    if (Elt >= 0 && Elt != 1)
3080      return false;
3081  }
3082
3083  bool HasHi = false;
3084  for (unsigned i = 2; i < 4; ++i) {
3085    int Elt = N->getMaskElt(i);
3086    if (Elt >= 0 && Elt != 3)
3087      return false;
3088    if (Elt == 3)
3089      HasHi = true;
3090  }
3091  // Don't use movshdup if it can be done with a shufps.
3092  // FIXME: verify that matching u, u, 3, 3 is what we want.
3093  return HasHi;
3094}
3095
3096/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3097/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3098bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
3099  if (N->getValueType(0).getVectorNumElements() != 4)
3100    return false;
3101
3102  // Expect 0, 0, 2, 2
3103  for (unsigned i = 0; i < 2; ++i)
3104    if (N->getMaskElt(i) > 0)
3105      return false;
3106
3107  bool HasHi = false;
3108  for (unsigned i = 2; i < 4; ++i) {
3109    int Elt = N->getMaskElt(i);
3110    if (Elt >= 0 && Elt != 2)
3111      return false;
3112    if (Elt == 2)
3113      HasHi = true;
3114  }
3115  // Don't use movsldup if it can be done with a shufps.
3116  return HasHi;
3117}
3118
3119/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3120/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
3121bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
3122  int e = N->getValueType(0).getVectorNumElements() / 2;
3123
3124  for (int i = 0; i < e; ++i)
3125    if (!isUndefOrEqual(N->getMaskElt(i), i))
3126      return false;
3127  for (int i = 0; i < e; ++i)
3128    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
3129      return false;
3130  return true;
3131}
3132
3133/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
3134/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
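/// Each result element contributes two bits, with element 0 in the lowest
/// bits, so the v4i32 mask <3, 2, 1, 0> encodes as the immediate 0x1B
/// (binary 00 01 10 11).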
3135unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
3136  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3137  int NumOperands = SVOp->getValueType(0).getVectorNumElements();
3138
3139  unsigned Shift = (NumOperands == 4) ? 2 : 1;
3140  unsigned Mask = 0;
3141  for (int i = 0; i < NumOperands; ++i) {
3142    int Val = SVOp->getMaskElt(NumOperands-i-1);
3143    if (Val < 0) Val = 0;
3144    if (Val >= NumOperands) Val -= NumOperands;
3145    Mask |= Val;
3146    if (i != NumOperands - 1)
3147      Mask <<= Shift;
3148  }
3149  return Mask;
3150}
3151
3152/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
3153/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
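/// Only mask elements 4..7 are encoded; each contributes two bits after
/// subtracting 4, with element 4 in the lowest bits, so the v8i16 mask
/// <0, 1, 2, 3, 7, 6, 5, 4> encodes as 0x1B.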
3154unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
3155  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3156  unsigned Mask = 0;
3157  // 8 nodes, but we only care about the last 4.
3158  for (unsigned i = 7; i >= 4; --i) {
3159    int Val = SVOp->getMaskElt(i);
3160    if (Val >= 0)
3161      Mask |= (Val - 4);
3162    if (i != 4)
3163      Mask <<= 2;
3164  }
3165  return Mask;
3166}
3167
3168/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
3169/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
3170unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
3171  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3172  unsigned Mask = 0;
3173  // 8 nodes, but we only care about the first 4.
3174  for (int i = 3; i >= 0; --i) {
3175    int Val = SVOp->getMaskElt(i);
3176    if (Val >= 0)
3177      Mask |= Val;
3178    if (i != 0)
3179      Mask <<= 2;
3180  }
3181  return Mask;
3182}
3183
3184/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
3185/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
3186unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
3187  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3188  EVT VVT = N->getValueType(0);
3189  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
3190  int Val = 0;
3191
3192  unsigned i, e;
3193  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
3194    Val = SVOp->getMaskElt(i);
3195    if (Val >= 0)
3196      break;
3197  }
3198  return (Val - i) * EltSize;
3199}
3200
3201/// isZeroNode - Returns true if Elt is a constant zero or a floating point
3202/// constant +0.0.
3203bool X86::isZeroNode(SDValue Elt) {
3204  return ((isa<ConstantSDNode>(Elt) &&
3205           cast<ConstantSDNode>(Elt)->isNullValue()) ||
3206          (isa<ConstantFPSDNode>(Elt) &&
3207           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
3208}
3209
3210/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
3211/// their permute mask.
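/// For example, shuffle(V1, V2, <0, 5, 2, 7>) becomes
/// shuffle(V2, V1, <4, 1, 6, 3>).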
3212static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
3213                                    SelectionDAG &DAG) {
3214  EVT VT = SVOp->getValueType(0);
3215  unsigned NumElems = VT.getVectorNumElements();
3216  SmallVector<int, 8> MaskVec;
3217
3218  for (unsigned i = 0; i != NumElems; ++i) {
3219    int idx = SVOp->getMaskElt(i);
3220    if (idx < 0)
3221      MaskVec.push_back(idx);
3222    else if (idx < (int)NumElems)
3223      MaskVec.push_back(idx + NumElems);
3224    else
3225      MaskVec.push_back(idx - NumElems);
3226  }
3227  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
3228                              SVOp->getOperand(0), &MaskVec[0]);
3229}
3230
3231/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3232/// the two vector operands have swapped position.
3233static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
3234  unsigned NumElems = VT.getVectorNumElements();
3235  for (unsigned i = 0; i != NumElems; ++i) {
3236    int idx = Mask[i];
3237    if (idx < 0)
3238      continue;
3239    else if (idx < (int)NumElems)
3240      Mask[i] = idx + NumElems;
3241    else
3242      Mask[i] = idx - NumElems;
3243  }
3244}
3245
3246/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
3247/// match movhlps. The lower half elements should come from upper half of
3248/// V1 (and in order), and the upper half elements should come from the upper
3249/// half of V2 (and in order).
3250static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
3251  if (Op->getValueType(0).getVectorNumElements() != 4)
3252    return false;
3253  for (unsigned i = 0, e = 2; i != e; ++i)
3254    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
3255      return false;
3256  for (unsigned i = 2; i != 4; ++i)
3257    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
3258      return false;
3259  return true;
3260}
3261
3262/// isScalarLoadToVector - Returns true if the node is a scalar load that
3263/// is promoted to a vector. It also returns the LoadSDNode by reference if
3264/// required.
3265static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
3266  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
3267    return false;
3268  N = N->getOperand(0).getNode();
3269  if (!ISD::isNON_EXTLoad(N))
3270    return false;
3271  if (LD)
3272    *LD = cast<LoadSDNode>(N);
3273  return true;
3274}
3275
3276/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
3277/// match movlp{s|d}. The lower half elements should come from lower half of
3278/// V1 (and in order), and the upper half elements should come from the upper
3279/// half of V2 (and in order). And since V1 will become the source of the
3280/// MOVLP, it must be either a vector load or a scalar load to vector.
3281static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
3282                               ShuffleVectorSDNode *Op) {
3283  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
3284    return false;
3285  // If V2 is a vector load, don't do this transformation. We will try to use a
3286  // load-folding shufps op instead.
3287  if (ISD::isNON_EXTLoad(V2))
3288    return false;
3289
3290  unsigned NumElems = Op->getValueType(0).getVectorNumElements();
3291
3292  if (NumElems != 2 && NumElems != 4)
3293    return false;
3294  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3295    if (!isUndefOrEqual(Op->getMaskElt(i), i))
3296      return false;
3297  for (unsigned i = NumElems/2; i != NumElems; ++i)
3298    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
3299      return false;
3300  return true;
3301}
3302
3303/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
3304/// all the same.
3305static bool isSplatVector(SDNode *N) {
3306  if (N->getOpcode() != ISD::BUILD_VECTOR)
3307    return false;
3308
3309  SDValue SplatValue = N->getOperand(0);
3310  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
3311    if (N->getOperand(i) != SplatValue)
3312      return false;
3313  return true;
3314}
3315
3316/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
3317/// to a zero vector.
3318/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
3319static bool isZeroShuffle(ShuffleVectorSDNode *N) {
3320  SDValue V1 = N->getOperand(0);
3321  SDValue V2 = N->getOperand(1);
3322  unsigned NumElems = N->getValueType(0).getVectorNumElements();
3323  for (unsigned i = 0; i != NumElems; ++i) {
3324    int Idx = N->getMaskElt(i);
3325    if (Idx >= (int)NumElems) {
3326      unsigned Opc = V2.getOpcode();
3327      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
3328        continue;
3329      if (Opc != ISD::BUILD_VECTOR ||
3330          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
3331        return false;
3332    } else if (Idx >= 0) {
3333      unsigned Opc = V1.getOpcode();
3334      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
3335        continue;
3336      if (Opc != ISD::BUILD_VECTOR ||
3337          !X86::isZeroNode(V1.getOperand(Idx)))
3338        return false;
3339    }
3340  }
3341  return true;
3342}
3343
3344/// getZeroVector - Returns a vector of specified type with all zero elements.
3345///
3346static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
3347                             DebugLoc dl) {
3348  assert(VT.isVector() && "Expected a vector type");
3349
3350  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3351  // type.  This ensures they get CSE'd.
3352  SDValue Vec;
3353  if (VT.getSizeInBits() == 64) { // MMX
3354    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3355    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3356  } else if (HasSSE2) {  // SSE2
3357    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3358    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3359  } else { // SSE1
3360    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
3361    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
3362  }
3363  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3364}
3365
3366/// getOnesVector - Returns a vector of specified type with all bits set.
3367///
3368static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
3369  assert(VT.isVector() && "Expected a vector type");
3370
3371  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3372  // type.  This ensures they get CSE'd.
3373  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
3374  SDValue Vec;
3375  if (VT.getSizeInBits() == 64)  // MMX
3376    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3377  else                                              // SSE
3378    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3379  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3380}
3381
3382
3383/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
3384/// that point to V2 point to its first element.
3385static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
3386  EVT VT = SVOp->getValueType(0);
3387  unsigned NumElems = VT.getVectorNumElements();
3388
3389  bool Changed = false;
3390  SmallVector<int, 8> MaskVec;
3391  SVOp->getMask(MaskVec);
3392
3393  for (unsigned i = 0; i != NumElems; ++i) {
3394    if (MaskVec[i] > (int)NumElems) {
3395      MaskVec[i] = NumElems;
3396      Changed = true;
3397    }
3398  }
3399  if (Changed)
3400    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
3401                                SVOp->getOperand(1), &MaskVec[0]);
3402  return SDValue(SVOp, 0);
3403}
3404
3405/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
3406/// operation of specified width.
3407static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3408                       SDValue V2) {
3409  unsigned NumElems = VT.getVectorNumElements();
3410  SmallVector<int, 8> Mask;
3411  Mask.push_back(NumElems);
3412  for (unsigned i = 1; i != NumElems; ++i)
3413    Mask.push_back(i);
3414  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3415}
3416
3417/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
3418static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3419                          SDValue V2) {
3420  unsigned NumElems = VT.getVectorNumElements();
3421  SmallVector<int, 8> Mask;
3422  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
3423    Mask.push_back(i);
3424    Mask.push_back(i + NumElems);
3425  }
3426  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3427}
3428
3429/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
3430static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3431                          SDValue V2) {
3432  unsigned NumElems = VT.getVectorNumElements();
3433  unsigned Half = NumElems/2;
3434  SmallVector<int, 8> Mask;
3435  for (unsigned i = 0; i != Half; ++i) {
3436    Mask.push_back(i + Half);
3437    Mask.push_back(i + NumElems + Half);
3438  }
3439  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3440}
3441
3442/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4f32.
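/// For example, splatting element 5 of a v8i16 first unpacks the high half so
/// the desired value lands in element 1 of the v4f32 view, then performs a
/// <1, 1, 1, 1> shuffle and converts the result back to v8i16.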
3443static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
3444                            bool HasSSE2) {
3445  if (SV->getValueType(0).getVectorNumElements() <= 4)
3446    return SDValue(SV, 0);
3447
3448  EVT PVT = MVT::v4f32;
3449  EVT VT = SV->getValueType(0);
3450  DebugLoc dl = SV->getDebugLoc();
3451  SDValue V1 = SV->getOperand(0);
3452  int NumElems = VT.getVectorNumElements();
3453  int EltNo = SV->getSplatIndex();
3454
3455  // unpack elements to the correct location
3456  while (NumElems > 4) {
3457    if (EltNo < NumElems/2) {
3458      V1 = getUnpackl(DAG, dl, VT, V1, V1);
3459    } else {
3460      V1 = getUnpackh(DAG, dl, VT, V1, V1);
3461      EltNo -= NumElems/2;
3462    }
3463    NumElems >>= 1;
3464  }
3465
3466  // Perform the splat.
3467  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
3468  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
3469  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
3470  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
3471}
3472
3473/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3474/// vector and a zero or undef vector.  This produces a shuffle where the low
3475/// element of V2 is swizzled into the zero/undef vector, landing at element
3476/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
3477static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3478                                             bool isZero, bool HasSSE2,
3479                                             SelectionDAG &DAG) {
3480  EVT VT = V2.getValueType();
3481  SDValue V1 = isZero
3482    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
3483  unsigned NumElems = VT.getVectorNumElements();
3484  SmallVector<int, 16> MaskVec;
3485  for (unsigned i = 0; i != NumElems; ++i)
3486    // If this is the insertion idx, put the low elt of V2 here.
3487    MaskVec.push_back(i == Idx ? NumElems : i);
3488  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
3489}
3490
3491/// getNumOfConsecutiveZeros - Return the number of consecutive elements,
3492/// counted from one end of a shuffle result, that are zero.
3493static
3494unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
3495                                  bool Low, SelectionDAG &DAG) {
3496  unsigned NumZeros = 0;
3497  for (int i = 0; i < NumElems; ++i) {
3498    unsigned Index = Low ? i : NumElems-i-1;
3499    int Idx = SVOp->getMaskElt(Index);
3500    if (Idx < 0) {
3501      ++NumZeros;
3502      continue;
3503    }
3504    SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
3505    if (Elt.getNode() && X86::isZeroNode(Elt))
3506      ++NumZeros;
3507    else
3508      break;
3509  }
3510  return NumZeros;
3511}
3512
3513/// isVectorShift - Returns true if the shuffle can be implemented as a
3514/// logical left or right shift of a vector.
3515/// FIXME: split into pslldqi, psrldqi, palignr variants.
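/// For example, shuffling V1 with an all-zero vector using the v4i32 mask
/// <4, 0, 1, 2> yields <0, V1[0], V1[1], V1[2]>, which is matched here as a
/// logical left shift of V1 by one element (isLeft == true, ShAmt == 1).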
3516static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3517                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3518  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
3519
3520  isLeft = true;
3521  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
3522  if (!NumZeros) {
3523    isLeft = false;
3524    NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
3525    if (!NumZeros)
3526      return false;
3527  }
3528  bool SeenV1 = false;
3529  bool SeenV2 = false;
3530  for (unsigned i = NumZeros; i < NumElems; ++i) {
3531    unsigned Val = isLeft ? (i - NumZeros) : i;
3532    int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
3533    if (Idx_ < 0)
3534      continue;
3535    unsigned Idx = (unsigned) Idx_;
3536    if (Idx < NumElems)
3537      SeenV1 = true;
3538    else {
3539      Idx -= NumElems;
3540      SeenV2 = true;
3541    }
3542    if (Idx != Val)
3543      return false;
3544  }
3545  if (SeenV1 && SeenV2)
3546    return false;
3547
3548  ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
3549  ShAmt = NumZeros;
3550  return true;
3551}
3552
3553
3554/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
3555///
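/// Adjacent byte pairs are zero-extended to i16 and combined into a single
/// element as (Op[2i+1] << 8) | Op[2i], then inserted into a v8i16 that is
/// finally bitcast back to v16i8.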
3556static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
3557                                       unsigned NumNonZero, unsigned NumZero,
3558                                       SelectionDAG &DAG,
3559                                       const TargetLowering &TLI) {
3560  if (NumNonZero > 8)
3561    return SDValue();
3562
3563  DebugLoc dl = Op.getDebugLoc();
3564  SDValue V(0, 0);
3565  bool First = true;
3566  for (unsigned i = 0; i < 16; ++i) {
3567    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
3568    if (ThisIsNonZero && First) {
3569      if (NumZero)
3570        V = getZeroVector(MVT::v8i16, true, DAG, dl);
3571      else
3572        V = DAG.getUNDEF(MVT::v8i16);
3573      First = false;
3574    }
3575
3576    if ((i & 1) != 0) {
3577      SDValue ThisElt(0, 0), LastElt(0, 0);
3578      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
3579      if (LastIsNonZero) {
3580        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
3581                              MVT::i16, Op.getOperand(i-1));
3582      }
3583      if (ThisIsNonZero) {
3584        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
3585        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
3586                              ThisElt, DAG.getConstant(8, MVT::i8));
3587        if (LastIsNonZero)
3588          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
3589      } else
3590        ThisElt = LastElt;
3591
3592      if (ThisElt.getNode())
3593        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
3594                        DAG.getIntPtrConstant(i/2));
3595    }
3596  }
3597
3598  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
3599}
3600
3601/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
3602///
3603static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
3604                                     unsigned NumNonZero, unsigned NumZero,
3605                                     SelectionDAG &DAG,
3606                                     const TargetLowering &TLI) {
3607  if (NumNonZero > 4)
3608    return SDValue();
3609
3610  DebugLoc dl = Op.getDebugLoc();
3611  SDValue V(0, 0);
3612  bool First = true;
3613  for (unsigned i = 0; i < 8; ++i) {
3614    bool isNonZero = (NonZeros & (1 << i)) != 0;
3615    if (isNonZero) {
3616      if (First) {
3617        if (NumZero)
3618          V = getZeroVector(MVT::v8i16, true, DAG, dl);
3619        else
3620          V = DAG.getUNDEF(MVT::v8i16);
3621        First = false;
3622      }
3623      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
3624                      MVT::v8i16, V, Op.getOperand(i),
3625                      DAG.getIntPtrConstant(i));
3626    }
3627  }
3628
3629  return V;
3630}
3631
3632/// getVShift - Return a vector logical shift node.
3633///
3634static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
3635                         unsigned NumBits, SelectionDAG &DAG,
3636                         const TargetLowering &TLI, DebugLoc dl) {
3637  bool isMMX = VT.getSizeInBits() == 64;
3638  EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
3639  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
3640  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
3641  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3642                     DAG.getNode(Opc, dl, ShVT, SrcOp,
3643                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
3644}
3645
3646SDValue
3647X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
3648                                          SelectionDAG &DAG) const {
3649
3650  // Check if the scalar load can be widened into a vector load. And if
3651  // the address is "base + cst" see if the cst can be "absorbed" into
3652  // the shuffle mask.
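  // For example, a 4-byte scalar load from (FrameIndex + 20) can be widened to
  // a 16-byte load from (FrameIndex + 16); the original value then sits in
  // element 1 of the wide vector, so the splat uses shuffle mask <1, 1, 1, 1>.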
3653  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
3654    SDValue Ptr = LD->getBasePtr();
3655    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
3656      return SDValue();
3657    EVT PVT = LD->getValueType(0);
3658    if (PVT != MVT::i32 && PVT != MVT::f32)
3659      return SDValue();
3660
3661    int FI = -1;
3662    int64_t Offset = 0;
3663    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
3664      FI = FINode->getIndex();
3665      Offset = 0;
3666    } else if (Ptr.getOpcode() == ISD::ADD &&
3667               isa<ConstantSDNode>(Ptr.getOperand(1)) &&
3668               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
3669      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
3670      Offset = Ptr.getConstantOperandVal(1);
3671      Ptr = Ptr.getOperand(0);
3672    } else {
3673      return SDValue();
3674    }
3675
3676    SDValue Chain = LD->getChain();
3677    // Make sure the stack object alignment is at least 16.
3678    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3679    if (DAG.InferPtrAlignment(Ptr) < 16) {
3680      if (MFI->isFixedObjectIndex(FI)) {
3681        // Can't change the alignment. FIXME: It's possible to compute
3682        // the exact stack offset and reference FI + adjust offset instead.
3683        // If someone *really* cares about this. That's the way to implement it.
3684        return SDValue();
3685      } else {
3686        MFI->setObjectAlignment(FI, 16);
3687      }
3688    }
3689
3690    // (Offset % 16) must be a multiple of 4. The address is then
3691    // Ptr + (Offset & ~15).
3692    if (Offset < 0)
3693      return SDValue();
3694    if ((Offset % 16) & 3)
3695      return SDValue();
3696    int64_t StartOffset = Offset & ~15;
3697    if (StartOffset)
3698      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
3699                        Ptr, DAG.getConstant(StartOffset, Ptr.getValueType()));
3700
3701    int EltNo = (Offset - StartOffset) >> 2;
3702    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
3703    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
3704    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, LD->getSrcValue(), 0,
3705                             false, false, 0);
3706    // Canonicalize it to a v4i32 shuffle.
3707    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
3708    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3709                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
3710                                            DAG.getUNDEF(MVT::v4i32), &Mask[0]));
3711  }
3712
3713  return SDValue();
3714}
3715
3716/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
3717/// vector of type 'VT', see if the elements can be replaced by a single large
3718/// load which has the same value as a build_vector whose operands are 'elts'.
3719///
3720/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
3721///
3722/// FIXME: we'd also like to handle the case where the last elements are zero
3723/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
3724/// There's even a handy isZeroNode for that purpose.
3725static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
3726                                        DebugLoc &dl, SelectionDAG &DAG) {
3727  EVT EltVT = VT.getVectorElementType();
3728  unsigned NumElems = Elts.size();
3729
3730  LoadSDNode *LDBase = NULL;
3731  unsigned LastLoadedElt = -1U;
3732
3733  // For each element in the initializer, see if we've found a load or an undef.
3734  // If we don't find an initial load element, or later load elements are
3735  // non-consecutive, bail out.
3736  for (unsigned i = 0; i < NumElems; ++i) {
3737    SDValue Elt = Elts[i];
3738
3739    if (!Elt.getNode() ||
3740        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
3741      return SDValue();
3742    if (!LDBase) {
3743      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
3744        return SDValue();
3745      LDBase = cast<LoadSDNode>(Elt.getNode());
3746      LastLoadedElt = i;
3747      continue;
3748    }
3749    if (Elt.getOpcode() == ISD::UNDEF)
3750      continue;
3751
3752    LoadSDNode *LD = cast<LoadSDNode>(Elt);
3753    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
3754      return SDValue();
3755    LastLoadedElt = i;
3756  }
3757
3758  // If we have found an entire vector of loads and undefs, then return a large
3759  // load of the entire vector width starting at the base pointer.  If we found
3760  // consecutive loads for the low half, generate a vzext_load node.
3761  if (LastLoadedElt == NumElems - 1) {
3762    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
3763      return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
3764                         LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
3765                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
3766    return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
3767                       LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
3768                       LDBase->isVolatile(), LDBase->isNonTemporal(),
3769                       LDBase->getAlignment());
3770  } else if (NumElems == 4 && LastLoadedElt == 1) {
3771    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
3772    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
3773    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
3774    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
3775  }
3776  return SDValue();
3777}
3778
3779SDValue
3780X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
3781  DebugLoc dl = Op.getDebugLoc();
3782  // All zeros are handled with pxor, all ones are handled with pcmpeqd.
3783  if (ISD::isBuildVectorAllZeros(Op.getNode())
3784      || ISD::isBuildVectorAllOnes(Op.getNode())) {
3785    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
3786    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
3787    // eliminated on x86-32 hosts.
3788    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
3789      return Op;
3790
3791    if (ISD::isBuildVectorAllOnes(Op.getNode()))
3792      return getOnesVector(Op.getValueType(), DAG, dl);
3793    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
3794  }
3795
3796  EVT VT = Op.getValueType();
3797  EVT ExtVT = VT.getVectorElementType();
3798  unsigned EVTBits = ExtVT.getSizeInBits();
3799
3800  unsigned NumElems = Op.getNumOperands();
3801  unsigned NumZero  = 0;
3802  unsigned NumNonZero = 0;
3803  unsigned NonZeros = 0;
3804  bool IsAllConstants = true;
3805  SmallSet<SDValue, 8> Values;
3806  for (unsigned i = 0; i < NumElems; ++i) {
3807    SDValue Elt = Op.getOperand(i);
3808    if (Elt.getOpcode() == ISD::UNDEF)
3809      continue;
3810    Values.insert(Elt);
3811    if (Elt.getOpcode() != ISD::Constant &&
3812        Elt.getOpcode() != ISD::ConstantFP)
3813      IsAllConstants = false;
3814    if (X86::isZeroNode(Elt))
3815      NumZero++;
3816    else {
3817      NonZeros |= (1 << i);
3818      NumNonZero++;
3819    }
3820  }
3821
3822  if (NumNonZero == 0) {
3823    // All undef vector. Return an UNDEF.  All zero vectors were handled above.
3824    return DAG.getUNDEF(VT);
3825  }
3826
3827  // Special case for single non-zero, non-undef, element.
3828  if (NumNonZero == 1) {
3829    unsigned Idx = CountTrailingZeros_32(NonZeros);
3830    SDValue Item = Op.getOperand(Idx);
3831
3832    // If this is an insertion of an i64 value on x86-32, and if the top bits of
3833    // the value are obviously zero, truncate the value to i32 and do the
3834    // insertion that way.  Only do this if the value is non-constant or if the
3835    // value is a constant being inserted into element 0.  It is cheaper to do
3836    // a constant pool load than it is to do a movd + shuffle.
3837    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
3838        (!IsAllConstants || Idx == 0)) {
3839      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
3840        // Handle MMX and SSE both.
3841        EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
3842        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
3843
3844        // Truncate the value (which may itself be a constant) to i32, and
3845        // convert it to a vector with movd (S2V+shuffle to zero extend).
3846        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
3847        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
3848        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3849                                           Subtarget->hasSSE2(), DAG);
3850
3851        // Now we have our 32-bit value zero extended in the low element of
3852        // a vector.  If Idx != 0, swizzle it into place.
3853        if (Idx != 0) {
3854          SmallVector<int, 4> Mask;
3855          Mask.push_back(Idx);
3856          for (unsigned i = 1; i != VecElts; ++i)
3857            Mask.push_back(i);
3858          Item = DAG.getVectorShuffle(VecVT, dl, Item,
3859                                      DAG.getUNDEF(Item.getValueType()),
3860                                      &Mask[0]);
3861        }
3862        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
3863      }
3864    }
3865
3866    // If we have a constant or non-constant insertion into the low element of
3867    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
3868    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
3869    // depending on what the source datatype is.
3870    if (Idx == 0) {
3871      if (NumZero == 0) {
3872        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3873      } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
3874          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
3875        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3876        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
3877        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
3878                                           DAG);
3879      } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
3880        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
3881        EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
3882        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
3883        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3884                                           Subtarget->hasSSE2(), DAG);
3885        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
3886      }
3887    }
3888
3889    // Is it a vector logical left shift?
3890    if (NumElems == 2 && Idx == 1 &&
3891        X86::isZeroNode(Op.getOperand(0)) &&
3892        !X86::isZeroNode(Op.getOperand(1))) {
3893      unsigned NumBits = VT.getSizeInBits();
3894      return getVShift(true, VT,
3895                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
3896                                   VT, Op.getOperand(1)),
3897                       NumBits/2, DAG, *this, dl);
3898    }
3899
3900    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
3901      return SDValue();
3902
3903    // Otherwise, if this is a vector with i32 or f32 elements, and the element
3904    // is a non-constant being inserted into an element other than the low one,
3905    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
3906    // movd/movss) to move this into the low element, then shuffle it into
3907    // place.
3908    if (EVTBits == 32) {
3909      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3910
3911      // Turn it into a shuffle of zero and zero-extended scalar to vector.
3912      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
3913                                         Subtarget->hasSSE2(), DAG);
3914      SmallVector<int, 8> MaskVec;
3915      for (unsigned i = 0; i < NumElems; i++)
3916        MaskVec.push_back(i == Idx ? 0 : 1);
3917      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
3918    }
3919  }
3920
3921  // Splat is obviously ok. Let legalizer expand it to a shuffle.
3922  if (Values.size() == 1) {
3923    if (EVTBits == 32) {
3924      // Instead of a shuffle like this:
3925      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
3926      // Check if it's possible to issue this instead.
3927      // shuffle (vload ptr), undef, <1, 1, 1, 1>
3928      unsigned Idx = CountTrailingZeros_32(NonZeros);
3929      SDValue Item = Op.getOperand(Idx);
3930      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
3931        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
3932    }
3933    return SDValue();
3934  }
3935
3936  // A vector full of immediates; various special cases are already
3937  // handled, so this is best done with a single constant-pool load.
3938  if (IsAllConstants)
3939    return SDValue();
3940
3941  // Let legalizer expand 2-wide build_vectors.
3942  if (EVTBits == 64) {
3943    if (NumNonZero == 1) {
3944      // One half is zero or undef.
3945      unsigned Idx = CountTrailingZeros_32(NonZeros);
3946      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
3947                                 Op.getOperand(Idx));
3948      return getShuffleVectorZeroOrUndef(V2, Idx, true,
3949                                         Subtarget->hasSSE2(), DAG);
3950    }
3951    return SDValue();
3952  }
3953
3954  // If element VT is < 32 bits, convert it to inserts into a zero vector.
3955  if (EVTBits == 8 && NumElems == 16) {
3956    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
3957                                        *this);
3958    if (V.getNode()) return V;
3959  }
3960
3961  if (EVTBits == 16 && NumElems == 8) {
3962    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
3963                                        *this);
3964    if (V.getNode()) return V;
3965  }
3966
3967  // If element VT is == 32 bits, turn it into a number of shuffles.
3968  SmallVector<SDValue, 8> V;
3969  V.resize(NumElems);
3970  if (NumElems == 4 && NumZero > 0) {
3971    for (unsigned i = 0; i < 4; ++i) {
3972      bool isZero = !(NonZeros & (1 << i));
3973      if (isZero)
3974        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
3975      else
3976        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3977    }
3978
3979    for (unsigned i = 0; i < 2; ++i) {
3980      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
3981        default: break;
3982        case 0:
3983          V[i] = V[i*2];  // Must be a zero vector.
3984          break;
3985        case 1:
3986          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
3987          break;
3988        case 2:
3989          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
3990          break;
3991        case 3:
3992          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
3993          break;
3994      }
3995    }
3996
3997    SmallVector<int, 8> MaskVec;
3998    bool Reverse = (NonZeros & 0x3) == 2;
3999    for (unsigned i = 0; i < 2; ++i)
4000      MaskVec.push_back(Reverse ? 1-i : i);
4001    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
4002    for (unsigned i = 0; i < 2; ++i)
4003      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
4004    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
4005  }
4006
4007  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
4008    // Check for a build vector of consecutive loads.
4009    for (unsigned i = 0; i < NumElems; ++i)
4010      V[i] = Op.getOperand(i);
4011
4012    // Check for elements which are consecutive loads.
4013    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
4014    if (LD.getNode())
4015      return LD;
4016
4017    // For SSE 4.1, use inserts into undef.
4018    if (getSubtarget()->hasSSE41()) {
4019      V[0] = DAG.getUNDEF(VT);
4020      for (unsigned i = 0; i < NumElems; ++i)
4021        if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
4022          V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
4023                             Op.getOperand(i), DAG.getIntPtrConstant(i));
4024      return V[0];
4025    }
4026
4027    // Otherwise, expand into a number of unpckl*
4028    // e.g. for v4f32
4029    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
4030    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
4031    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
4032    for (unsigned i = 0; i < NumElems; ++i)
4033      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
4034    NumElems >>= 1;
4035    while (NumElems != 0) {
4036      for (unsigned i = 0; i < NumElems; ++i)
4037        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
4038      NumElems >>= 1;
4039    }
4040    return V[0];
4041  }
4042  return SDValue();
4043}
4044
4045SDValue
4046X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
4047  // We support concatenating two MMX registers and placing them in an MMX
4048  // register.  This is better than doing a stack convert.
4049  DebugLoc dl = Op.getDebugLoc();
4050  EVT ResVT = Op.getValueType();
4051  assert(Op.getNumOperands() == 2);
4052  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
4053         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
4054  int Mask[2];
4055  SDValue InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, Op.getOperand(0));
4056  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
4057  InVec = Op.getOperand(1);
4058  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
4059    unsigned NumElts = ResVT.getVectorNumElements();
4060    VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
4061    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
4062                       InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
4063  } else {
4064    InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
4065    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
4066    Mask[0] = 0; Mask[1] = 2;
4067    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
4068  }
4069  return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
4070}
4071
4072// v8i16 shuffles - Prefer shuffles in the following order:
4073// 1. [all]   pshuflw, pshufhw, optional move
4074// 2. [ssse3] 1 x pshufb
4075// 3. [ssse3] 2 x pshufb + 1 x por
4076// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
4077static
4078SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
4079                                 SelectionDAG &DAG,
4080                                 const X86TargetLowering &TLI) {
4081  SDValue V1 = SVOp->getOperand(0);
4082  SDValue V2 = SVOp->getOperand(1);
4083  DebugLoc dl = SVOp->getDebugLoc();
4084  SmallVector<int, 8> MaskVals;
4085
4086  // Determine if more than 1 of the words in each of the low and high quadwords
4087  // of the result come from the same quadword of one of the two inputs.  Undef
4088  // mask values count as coming from any quadword, for better codegen.
4089  SmallVector<unsigned, 4> LoQuad(4);
4090  SmallVector<unsigned, 4> HiQuad(4);
4091  BitVector InputQuads(4);
4092  for (unsigned i = 0; i < 8; ++i) {
4093    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
4094    int EltIdx = SVOp->getMaskElt(i);
4095    MaskVals.push_back(EltIdx);
4096    if (EltIdx < 0) {
4097      ++Quad[0];
4098      ++Quad[1];
4099      ++Quad[2];
4100      ++Quad[3];
4101      continue;
4102    }
4103    ++Quad[EltIdx / 4];
4104    InputQuads.set(EltIdx / 4);
4105  }
4106
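  // For each half of the result, pick the input quadword that supplies the
  // most words.  A quadword must supply at least two words to be chosen
  // (MaxQuad starts at 1); otherwise the half keeps the sentinel value -1.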
4107  int BestLoQuad = -1;
4108  unsigned MaxQuad = 1;
4109  for (unsigned i = 0; i < 4; ++i) {
4110    if (LoQuad[i] > MaxQuad) {
4111      BestLoQuad = i;
4112      MaxQuad = LoQuad[i];
4113    }
4114  }
4115
4116  int BestHiQuad = -1;
4117  MaxQuad = 1;
4118  for (unsigned i = 0; i < 4; ++i) {
4119    if (HiQuad[i] > MaxQuad) {
4120      BestHiQuad = i;
4121      MaxQuad = HiQuad[i];
4122    }
4123  }
4124
4125  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
4126  // of the two input vectors, shuffle them into one input vector so only a
4127  // single pshufb instruction is necessary. If there are more than 2 input
4128  // quads, disable the next transformation since it does not help SSSE3.
4129  bool V1Used = InputQuads[0] || InputQuads[1];
4130  bool V2Used = InputQuads[2] || InputQuads[3];
4131  if (TLI.getSubtarget()->hasSSSE3()) {
4132    if (InputQuads.count() == 2 && V1Used && V2Used) {
4133      BestLoQuad = InputQuads.find_first();
4134      BestHiQuad = InputQuads.find_next(BestLoQuad);
4135    }
4136    if (InputQuads.count() > 2) {
4137      BestLoQuad = -1;
4138      BestHiQuad = -1;
4139    }
4140  }
4141
4142  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
4143  // the shuffle mask.  If a quad is scored as -1, that means that it contains
4144  // words from all 4 input quadwords.
4145  SDValue NewV;
4146  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
4147    SmallVector<int, 8> MaskV;
4148    MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
4149    MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
4150    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
4151                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
4152                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
4153    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
4154
4155    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
4156    // source words for the shuffle, to aid later transformations.
4157    bool AllWordsInNewV = true;
4158    bool InOrder[2] = { true, true };
4159    for (unsigned i = 0; i != 8; ++i) {
4160      int idx = MaskVals[i];
4161      if (idx != (int)i)
4162        InOrder[i/4] = false;
4163      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
4164        continue;
4165      AllWordsInNewV = false;
4166      break;
4167    }
4168
4169    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
4170    if (AllWordsInNewV) {
4171      for (int i = 0; i != 8; ++i) {
4172        int idx = MaskVals[i];
4173        if (idx < 0)
4174          continue;
4175        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
4176        if ((idx != i) && idx < 4)
4177          pshufhw = false;
4178        if ((idx != i) && idx > 3)
4179          pshuflw = false;
4180      }
4181      V1 = NewV;
4182      V2Used = false;
4183      BestLoQuad = 0;
4184      BestHiQuad = 1;
4185    }
4186
4187    // If we've eliminated the use of V2, and the new mask is a pshuflw or
4188    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
4189    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
4190      return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
4191                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
4192    }
4193  }
4194
4195  // If we have SSSE3, and all words of the result are from 1 input vector,
4196  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
4197  // is present, fall back to case 4.
4198  if (TLI.getSubtarget()->hasSSSE3()) {
4199    SmallVector<SDValue,16> pshufbMask;
4200
4201    // If we have elements from both input vectors, set the high bit of the
4202    // shuffle mask element to zero out elements that come from V2 in the V1
4203    // mask, and elements that come from V1 in the V2 mask, so that the two
4204    // results can be OR'd together.
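    // Each result word i maps to the two consecutive control bytes
    // (2*MaskVals[i], 2*MaskVals[i]+1); a control byte of 0x80 makes pshufb
    // zero the corresponding result byte.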
4205    bool TwoInputs = V1Used && V2Used;
4206    for (unsigned i = 0; i != 8; ++i) {
4207      int EltIdx = MaskVals[i] * 2;
4208      if (TwoInputs && (EltIdx >= 16)) {
4209        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4210        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4211        continue;
4212      }
4213      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
4214      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
4215    }
4216    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
4217    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4218                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4219                                 MVT::v16i8, &pshufbMask[0], 16));
4220    if (!TwoInputs)
4221      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4222
4223    // Calculate the shuffle mask for the second input, shuffle it, and
4224    // OR it with the first shuffled input.
4225    pshufbMask.clear();
4226    for (unsigned i = 0; i != 8; ++i) {
4227      int EltIdx = MaskVals[i] * 2;
4228      if (EltIdx < 16) {
4229        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4230        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4231        continue;
4232      }
4233      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4234      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
4235    }
4236    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
4237    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4238                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4239                                 MVT::v16i8, &pshufbMask[0], 16));
4240    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4241    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4242  }
4243
4244  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
4245  // and update MaskVals with the new element order.
4246  BitVector InOrder(8);
4247  if (BestLoQuad >= 0) {
4248    SmallVector<int, 8> MaskV;
4249    for (int i = 0; i != 4; ++i) {
4250      int idx = MaskVals[i];
4251      if (idx < 0) {
4252        MaskV.push_back(-1);
4253        InOrder.set(i);
4254      } else if ((idx / 4) == BestLoQuad) {
4255        MaskV.push_back(idx & 3);
4256        InOrder.set(i);
4257      } else {
4258        MaskV.push_back(-1);
4259      }
4260    }
4261    for (unsigned i = 4; i != 8; ++i)
4262      MaskV.push_back(i);
4263    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
4264                                &MaskV[0]);
4265  }
4266
4267  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
4268  // and update MaskVals with the new element order.
4269  if (BestHiQuad >= 0) {
4270    SmallVector<int, 8> MaskV;
4271    for (unsigned i = 0; i != 4; ++i)
4272      MaskV.push_back(i);
4273    for (unsigned i = 4; i != 8; ++i) {
4274      int idx = MaskVals[i];
4275      if (idx < 0) {
4276        MaskV.push_back(-1);
4277        InOrder.set(i);
4278      } else if ((idx / 4) == BestHiQuad) {
4279        MaskV.push_back((idx & 3) + 4);
4280        InOrder.set(i);
4281      } else {
4282        MaskV.push_back(-1);
4283      }
4284    }
4285    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
4286                                &MaskV[0]);
4287  }
4288
4289  // In case BestHiQuad & BestLoQuad were both -1, which means each quadword
4290  // has a word from each of the four input quadwords, calculate the InOrder
4291  // bitvector now before falling through to the insert/extract cleanup.
4292  if (BestLoQuad == -1 && BestHiQuad == -1) {
4293    NewV = V1;
4294    for (int i = 0; i != 8; ++i)
4295      if (MaskVals[i] < 0 || MaskVals[i] == i)
4296        InOrder.set(i);
4297  }
4298
4299  // The other elements are put in the right place using pextrw and pinsrw.
4300  for (unsigned i = 0; i != 8; ++i) {
4301    if (InOrder[i])
4302      continue;
4303    int EltIdx = MaskVals[i];
4304    if (EltIdx < 0)
4305      continue;
4306    SDValue ExtOp = (EltIdx < 8)
4307    ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
4308                  DAG.getIntPtrConstant(EltIdx))
4309    : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
4310                  DAG.getIntPtrConstant(EltIdx - 8));
4311    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
4312                       DAG.getIntPtrConstant(i));
4313  }
4314  return NewV;
4315}
4316
4317// v16i8 shuffles - Prefer shuffles in the following order:
4318// 1. [ssse3] 1 x pshufb
4319// 2. [ssse3] 2 x pshufb + 1 x por
4320// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
4321static
4322SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
4323                                 SelectionDAG &DAG,
4324                                 const X86TargetLowering &TLI) {
4325  SDValue V1 = SVOp->getOperand(0);
4326  SDValue V2 = SVOp->getOperand(1);
4327  DebugLoc dl = SVOp->getDebugLoc();
4328  SmallVector<int, 16> MaskVals;
4329  SVOp->getMask(MaskVals);
4330
4331  // If we have SSSE3, case 1 is generated when all result bytes come from
4332  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
4333  // present, fall back to case 3.
4334  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
4335  bool V1Only = true;
4336  bool V2Only = true;
4337  for (unsigned i = 0; i < 16; ++i) {
4338    int EltIdx = MaskVals[i];
4339    if (EltIdx < 0)
4340      continue;
4341    if (EltIdx < 16)
4342      V2Only = false;
4343    else
4344      V1Only = false;
4345  }
4346
4347  // If we have SSSE3, use one pshufb per input vector that has elements in the result.
4348  if (TLI.getSubtarget()->hasSSSE3()) {
4349    SmallVector<SDValue,16> pshufbMask;
4350
4351    // If all result elements are from one input vector, then only translate
4352    // undef mask values to 0x80 (zero out result) in the pshufb mask.
4353    //
4354    // Otherwise, we have elements from both input vectors, and must zero out
4355    // elements that come from V2 in the first mask, and V1 in the second mask
4356    // so that we can OR them together.
4357    bool TwoInputs = !(V1Only || V2Only);
4358    for (unsigned i = 0; i != 16; ++i) {
4359      int EltIdx = MaskVals[i];
4360      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
4361        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4362        continue;
4363      }
4364      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
4365    }
4366    // If all the elements are from V2, assign it to V1 and return after
4367    // building the first pshufb.
4368    if (V2Only)
4369      V1 = V2;
4370    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4371                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4372                                 MVT::v16i8, &pshufbMask[0], 16));
4373    if (!TwoInputs)
4374      return V1;
4375
4376    // Calculate the shuffle mask for the second input, shuffle it, and
4377    // OR it with the first shuffled input.
4378    pshufbMask.clear();
4379    for (unsigned i = 0; i != 16; ++i) {
4380      int EltIdx = MaskVals[i];
4381      if (EltIdx < 16) {
4382        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4383        continue;
4384      }
4385      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4386    }
4387    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4388                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4389                                 MVT::v16i8, &pshufbMask[0], 16));
4390    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4391  }
4392
4393  // No SSSE3 - Calculate in-place words and then fix all out-of-place words
4394  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
4395  // the 16 different words that comprise the two doublequadword input vectors.
4396  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4397  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
4398  SDValue NewV = V2Only ? V2 : V1;
4399  for (int i = 0; i != 8; ++i) {
4400    int Elt0 = MaskVals[i*2];
4401    int Elt1 = MaskVals[i*2+1];
4402
4403    // This word of the result is all undef, skip it.
4404    if (Elt0 < 0 && Elt1 < 0)
4405      continue;
4406
4407    // This word of the result is already in the correct place, skip it.
4408    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
4409      continue;
4410    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
4411      continue;
4412
4413    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
4414    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
4415    SDValue InsElt;
4416
4417    // If Elt0 and Elt1 are defined, are consecutive, and can be fetched
4418    // together with a single extract, extract the word and insert it.
4419    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
4420      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4421                           DAG.getIntPtrConstant(Elt1 / 2));
4422      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4423                        DAG.getIntPtrConstant(i));
4424      continue;
4425    }
4426
4427    // If Elt1 is defined, extract it from the appropriate source.  If the
4428    // source byte is not also odd, shift the extracted word left 8 bits;
4429    // otherwise clear the bottom 8 bits if we need to do an or.
4430    if (Elt1 >= 0) {
4431      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4432                           DAG.getIntPtrConstant(Elt1 / 2));
4433      if ((Elt1 & 1) == 0)
4434        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
4435                             DAG.getConstant(8, TLI.getShiftAmountTy()));
4436      else if (Elt0 >= 0)
4437        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
4438                             DAG.getConstant(0xFF00, MVT::i16));
4439    }
4440    // If Elt0 is defined, extract it from the appropriate source.  If the
4441    // source byte is not also even, shift the extracted word right 8 bits. If
4442    // Elt1 was also defined, OR the extracted values together before
4443    // inserting them in the result.
4444    if (Elt0 >= 0) {
4445      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
4446                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
4447      if ((Elt0 & 1) != 0)
4448        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
4449                              DAG.getConstant(8, TLI.getShiftAmountTy()));
4450      else if (Elt1 >= 0)
4451        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
4452                             DAG.getConstant(0x00FF, MVT::i16));
4453      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
4454                         : InsElt0;
4455    }
4456    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4457                       DAG.getIntPtrConstant(i));
4458  }
4459  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
4460}
4461
4462/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
4463/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
4464/// done when every pair / quad of shuffle mask elements points to elements in
4465/// the right sequence. e.g.
4466/// vector_shuffle <>, <>, < 2, 3, | 10, 11, | 0, 1, | 14, 15>
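/// which, with a scale of 2, becomes the v4i32 shuffle mask < 1, 5, 0, 7 >.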
4467static
4468SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
4469                                 SelectionDAG &DAG,
4470                                 const TargetLowering &TLI, DebugLoc dl) {
4471  EVT VT = SVOp->getValueType(0);
4472  SDValue V1 = SVOp->getOperand(0);
4473  SDValue V2 = SVOp->getOperand(1);
4474  unsigned NumElems = VT.getVectorNumElements();
4475  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
4476  EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
4477  EVT NewVT = MaskVT;
4478  switch (VT.getSimpleVT().SimpleTy) {
4479  default: assert(false && "Unexpected!");
4480  case MVT::v4f32: NewVT = MVT::v2f64; break;
4481  case MVT::v4i32: NewVT = MVT::v2i64; break;
4482  case MVT::v8i16: NewVT = MVT::v4i32; break;
4483  case MVT::v16i8: NewVT = MVT::v4i32; break;
4484  }
4485
4486  if (NewWidth == 2) {
4487    if (VT.isInteger())
4488      NewVT = MVT::v2i64;
4489    else
4490      NewVT = MVT::v2f64;
4491  }
4492  int Scale = NumElems / NewWidth;
4493  SmallVector<int, 8> MaskVec;
4494  for (unsigned i = 0; i < NumElems; i += Scale) {
4495    int StartIdx = -1;
4496    for (int j = 0; j < Scale; ++j) {
4497      int EltIdx = SVOp->getMaskElt(i+j);
4498      if (EltIdx < 0)
4499        continue;
4500      if (StartIdx == -1)
4501        StartIdx = EltIdx - (EltIdx % Scale);
4502      if (EltIdx != StartIdx + j)
4503        return SDValue();
4504    }
4505    if (StartIdx == -1)
4506      MaskVec.push_back(-1);
4507    else
4508      MaskVec.push_back(StartIdx / Scale);
4509  }
4510
4511  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
4512  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
4513  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
4514}
4515
4516/// getVZextMovL - Return a zero-extending vector move low node.
4517///
4518static SDValue getVZextMovL(EVT VT, EVT OpVT,
4519                            SDValue SrcOp, SelectionDAG &DAG,
4520                            const X86Subtarget *Subtarget, DebugLoc dl) {
4521  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
4522    LoadSDNode *LD = NULL;
4523    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
4524      LD = dyn_cast<LoadSDNode>(SrcOp);
4525    if (!LD) {
4526      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
4527      // instead.
4528      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
4529      if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
4530          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
4531          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
4532          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
4533        // PR2108
4534        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
4535        return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4536                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4537                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4538                                                   OpVT,
4539                                                   SrcOp.getOperand(0)
4540                                                          .getOperand(0))));
4541      }
4542    }
4543  }
4544
4545  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4546                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4547                                 DAG.getNode(ISD::BIT_CONVERT, dl,
4548                                             OpVT, SrcOp)));
4549}
4550
4551/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
4552/// shuffles.
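/// Roughly: if at most two result elements come from each input, two shuffles
/// suffice; if three come from one input, a pair of SHUFPS-style shuffles is
/// built; otherwise the mask is split into low and high halves that are
/// shuffled separately and then recombined.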
4553static SDValue
4554LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
4555  SDValue V1 = SVOp->getOperand(0);
4556  SDValue V2 = SVOp->getOperand(1);
4557  DebugLoc dl = SVOp->getDebugLoc();
4558  EVT VT = SVOp->getValueType(0);
4559
4560  SmallVector<std::pair<int, int>, 8> Locs;
4561  Locs.resize(4);
4562  SmallVector<int, 8> Mask1(4U, -1);
4563  SmallVector<int, 8> PermMask;
4564  SVOp->getMask(PermMask);
4565
4566  unsigned NumHi = 0;
4567  unsigned NumLo = 0;
4568  for (unsigned i = 0; i != 4; ++i) {
4569    int Idx = PermMask[i];
4570    if (Idx < 0) {
4571      Locs[i] = std::make_pair(-1, -1);
4572    } else {
4573      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
4574      if (Idx < 4) {
4575        Locs[i] = std::make_pair(0, NumLo);
4576        Mask1[NumLo] = Idx;
4577        NumLo++;
4578      } else {
4579        Locs[i] = std::make_pair(1, NumHi);
4580        if (2+NumHi < 4)
4581          Mask1[2+NumHi] = Idx;
4582        NumHi++;
4583      }
4584    }
4585  }
4586
4587  if (NumLo <= 2 && NumHi <= 2) {
4588    // No more than two elements come from either vector. This can be
4589    // implemented with two shuffles. The first shuffle gathers the elements.
4590    // The second shuffle, which takes the first shuffle as both of its
4591    // vector operands, puts the elements into the right order.
4592    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4593
4594    SmallVector<int, 8> Mask2(4U, -1);
4595
4596    for (unsigned i = 0; i != 4; ++i) {
4597      if (Locs[i].first == -1)
4598        continue;
4599      else {
4600        unsigned Idx = (i < 2) ? 0 : 4;
4601        Idx += Locs[i].first * 2 + Locs[i].second;
4602        Mask2[i] = Idx;
4603      }
4604    }
4605
4606    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
4607  } else if (NumLo == 3 || NumHi == 3) {
4608    // Otherwise, we must have three elements from one vector, call it X, and
4609    // one element from the other, call it Y.  First, use a shufps to build an
4610    // intermediate vector with the one element from Y and the element from X
4611    // that will be in the same half in the final destination (the indexes don't
4612    // matter). Then, use a shufps to build the final vector, taking the half
4613    // containing the element from Y from the intermediate, and the other half
4614    // from X.
4615    if (NumHi == 3) {
4616      // Normalize it so the 3 elements come from V1.
4617      CommuteVectorShuffleMask(PermMask, VT);
4618      std::swap(V1, V2);
4619    }
4620
4621    // Find the element from V2.
4622    unsigned HiIndex;
4623    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
4624      int Val = PermMask[HiIndex];
4625      if (Val < 0)
4626        continue;
4627      if (Val >= 4)
4628        break;
4629    }
4630
4631    Mask1[0] = PermMask[HiIndex];
4632    Mask1[1] = -1;
4633    Mask1[2] = PermMask[HiIndex^1];
4634    Mask1[3] = -1;
4635    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4636
4637    if (HiIndex >= 2) {
4638      Mask1[0] = PermMask[0];
4639      Mask1[1] = PermMask[1];
4640      Mask1[2] = HiIndex & 1 ? 6 : 4;
4641      Mask1[3] = HiIndex & 1 ? 4 : 6;
4642      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4643    } else {
4644      Mask1[0] = HiIndex & 1 ? 2 : 0;
4645      Mask1[1] = HiIndex & 1 ? 0 : 2;
4646      Mask1[2] = PermMask[2];
4647      Mask1[3] = PermMask[3];
4648      if (Mask1[2] >= 0)
4649        Mask1[2] += 4;
4650      if (Mask1[3] >= 0)
4651        Mask1[3] += 4;
4652      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
4653    }
4654  }
4655
4656  // Break it into (shuffle shuffle_hi, shuffle_lo).
4657  Locs.clear();
  Locs.resize(4); // The loop below assigns Locs[0..3], so re-grow after clear().
4658  SmallVector<int,8> LoMask(4U, -1);
4659  SmallVector<int,8> HiMask(4U, -1);
4660
4661  SmallVector<int,8> *MaskPtr = &LoMask;
4662  unsigned MaskIdx = 0;
4663  unsigned LoIdx = 0;
4664  unsigned HiIdx = 2;
4665  for (unsigned i = 0; i != 4; ++i) {
4666    if (i == 2) {
4667      MaskPtr = &HiMask;
4668      MaskIdx = 1;
4669      LoIdx = 0;
4670      HiIdx = 2;
4671    }
4672    int Idx = PermMask[i];
4673    if (Idx < 0) {
4674      Locs[i] = std::make_pair(-1, -1);
4675    } else if (Idx < 4) {
4676      Locs[i] = std::make_pair(MaskIdx, LoIdx);
4677      (*MaskPtr)[LoIdx] = Idx;
4678      LoIdx++;
4679    } else {
4680      Locs[i] = std::make_pair(MaskIdx, HiIdx);
4681      (*MaskPtr)[HiIdx] = Idx;
4682      HiIdx++;
4683    }
4684  }
4685
4686  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
4687  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
4688  SmallVector<int, 8> MaskOps;
4689  for (unsigned i = 0; i != 4; ++i) {
4690    if (Locs[i].first == -1) {
4691      MaskOps.push_back(-1);
4692    } else {
4693      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
4694      MaskOps.push_back(Idx);
4695    }
4696  }
4697  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
4698}
4699
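/// LowerVECTOR_SHUFFLE - Main entry point for lowering VECTOR_SHUFFLE.  Tries,
/// roughly in order: zero vectors, splats, narrower rewrites, vector shifts,
/// the fixed x86 shuffle masks (MOVL, MOVHLPS, UNPCK*, SHUFP, ...), already
/// legal masks, and finally the v8i16 / v16i8 / 4-wide helpers above.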
4700SDValue
4701X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
4702  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
4703  SDValue V1 = Op.getOperand(0);
4704  SDValue V2 = Op.getOperand(1);
4705  EVT VT = Op.getValueType();
4706  DebugLoc dl = Op.getDebugLoc();
4707  unsigned NumElems = VT.getVectorNumElements();
4708  bool isMMX = VT.getSizeInBits() == 64;
4709  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
4710  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
4711  bool V1IsSplat = false;
4712  bool V2IsSplat = false;
4713
4714  if (isZeroShuffle(SVOp))
4715    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
4716
4717  // Promote splats to v4f32.
4718  if (SVOp->isSplat()) {
4719    if (isMMX || NumElems < 4)
4720      return Op;
4721    return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
4722  }
4723
4724  // If the shuffle can be profitably rewritten as a narrower shuffle, then
4725  // do it!
4726  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
4727    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4728    if (NewOp.getNode())
4729      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4730                         LowerVECTOR_SHUFFLE(NewOp, DAG));
4731  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
4732    // FIXME: Figure out a cleaner way to do this.
4733    // Try to make use of movq to zero out the top part.
4734    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
4735      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4736      if (NewOp.getNode()) {
4737        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
4738          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
4739                              DAG, Subtarget, dl);
4740      }
4741    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
4742      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4743      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
4744        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
4745                            DAG, Subtarget, dl);
4746    }
4747  }
4748
4749  if (X86::isPSHUFDMask(SVOp))
4750    return Op;
4751
4752  // Check if this can be converted into a logical shift.
4753  bool isLeft = false;
4754  unsigned ShAmt = 0;
4755  SDValue ShVal;
4756  bool isShift = getSubtarget()->hasSSE2() &&
4757    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
4758  if (isShift && ShVal.hasOneUse()) {
4759    // If the shifted value has multiple uses, it may be cheaper to use
4760    // v_set0 + movlhps or movhlps, etc.
4761    EVT EltVT = VT.getVectorElementType();
4762    ShAmt *= EltVT.getSizeInBits();
4763    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4764  }
4765
4766  if (X86::isMOVLMask(SVOp)) {
4767    if (V1IsUndef)
4768      return V2;
4769    if (ISD::isBuildVectorAllZeros(V1.getNode()))
4770      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
4771    if (!isMMX)
4772      return Op;
4773  }
4774
4775  // FIXME: fold these into legal mask.
4776  if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
4777                 X86::isMOVSLDUPMask(SVOp) ||
4778                 X86::isMOVHLPSMask(SVOp) ||
4779                 X86::isMOVLHPSMask(SVOp) ||
4780                 X86::isMOVLPMask(SVOp)))
4781    return Op;
4782
4783  if (ShouldXformToMOVHLPS(SVOp) ||
4784      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
4785    return CommuteVectorShuffle(SVOp, DAG);
4786
4787  if (isShift) {
4788    // No better options. Use a vshl / vsrl.
4789    EVT EltVT = VT.getVectorElementType();
4790    ShAmt *= EltVT.getSizeInBits();
4791    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4792  }
4793
4794  bool Commuted = false;
4795  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
4796  // 1,1,1,1 -> v8i16 though.
4797  V1IsSplat = isSplatVector(V1.getNode());
4798  V2IsSplat = isSplatVector(V2.getNode());
4799
4800  // Canonicalize the splat or undef, if present, to be on the RHS.
4801  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
4802    Op = CommuteVectorShuffle(SVOp, DAG);
4803    SVOp = cast<ShuffleVectorSDNode>(Op);
4804    V1 = SVOp->getOperand(0);
4805    V2 = SVOp->getOperand(1);
4806    std::swap(V1IsSplat, V2IsSplat);
4807    std::swap(V1IsUndef, V2IsUndef);
4808    Commuted = true;
4809  }
4810
4811  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
4812    // Shuffling low element of v1 into undef, just return v1.
4813    if (V2IsUndef)
4814      return V1;
4815    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
4816    // the instruction selector will not match, so get a canonical MOVL with
4817    // swapped operands to undo the commute.
4818    return getMOVL(DAG, dl, VT, V2, V1);
4819  }
4820
4821  if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
4822      X86::isUNPCKH_v_undef_Mask(SVOp) ||
4823      X86::isUNPCKLMask(SVOp) ||
4824      X86::isUNPCKHMask(SVOp))
4825    return Op;
4826
4827  if (V2IsSplat) {
4828    // Normalize the mask so all entries that point to V2 point to its first
4829    // element, then try to match unpck{h|l} again. If there is a match, return
4830    // a new vector_shuffle with the corrected mask.
4831    SDValue NewMask = NormalizeMask(SVOp, DAG);
4832    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
4833    if (NSVOp != SVOp) {
4834      if (X86::isUNPCKLMask(NSVOp, true)) {
4835        return NewMask;
4836      } else if (X86::isUNPCKHMask(NSVOp, true)) {
4837        return NewMask;
4838      }
4839    }
4840  }
4841
4842  if (Commuted) {
4843    // Commute it back and try unpck* again.
4844    // FIXME: this seems wrong.
4845    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
4846    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
4847    if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
4848        X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
4849        X86::isUNPCKLMask(NewSVOp) ||
4850        X86::isUNPCKHMask(NewSVOp))
4851      return NewOp;
4852  }
4853
4854  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
4855
4856  // Normalize the node to match x86 shuffle ops if needed
4857  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
4858    return CommuteVectorShuffle(SVOp, DAG);
4859
4860  // If the shuffle mask is already legal, return the op unchanged.
4861  SmallVector<int, 16> PermMask;
4862  SVOp->getMask(PermMask);
4863  if (isShuffleMaskLegal(PermMask, VT))
4864    return Op;
4865
4866  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
4867  if (VT == MVT::v8i16) {
4868    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
4869    if (NewOp.getNode())
4870      return NewOp;
4871  }
4872
4873  if (VT == MVT::v16i8) {
4874    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
4875    if (NewOp.getNode())
4876      return NewOp;
4877  }
4878
4879  // Handle all 4 wide cases with a number of shuffles except for MMX.
4880  if (NumElems == 4 && !isMMX)
4881    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
4882
4883  return SDValue();
4884}
4885
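/// LowerEXTRACT_VECTOR_ELT_SSE4 - Lower element extraction using the SSE4.1
/// PEXTRB / PEXTRW / EXTRACTPS forms.  Returns SDValue() when none of them is
/// profitable, so the caller can fall back to the generic path.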
4886SDValue
4887X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
4888                                                SelectionDAG &DAG) const {
4889  EVT VT = Op.getValueType();
4890  DebugLoc dl = Op.getDebugLoc();
4891  if (VT.getSizeInBits() == 8) {
4892    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
4893                                    Op.getOperand(0), Op.getOperand(1));
4894    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4895                                    DAG.getValueType(VT));
4896    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4897  } else if (VT.getSizeInBits() == 16) {
4898    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4899    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
4900    if (Idx == 0)
4901      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4902                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4903                                     DAG.getNode(ISD::BIT_CONVERT, dl,
4904                                                 MVT::v4i32,
4905                                                 Op.getOperand(0)),
4906                                     Op.getOperand(1)));
4907    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
4908                                    Op.getOperand(0), Op.getOperand(1));
4909    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4910                                    DAG.getValueType(VT));
4911    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4912  } else if (VT == MVT::f32) {
4913    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
4914    // the result back to an FR32 register. It's only worth matching if the
4915    // result has a single use which is a store or a bitcast to i32.  And in
4916    // the case of a store, it's not worth it if the index is a constant 0,
4917    // because a MOVSSmr can be used instead, which is smaller and faster.
4918    if (!Op.hasOneUse())
4919      return SDValue();
4920    SDNode *User = *Op.getNode()->use_begin();
4921    if ((User->getOpcode() != ISD::STORE ||
4922         (isa<ConstantSDNode>(Op.getOperand(1)) &&
4923          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
4924        (User->getOpcode() != ISD::BIT_CONVERT ||
4925         User->getValueType(0) != MVT::i32))
4926      return SDValue();
4927    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4928                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
4929                                              Op.getOperand(0)),
4930                                              Op.getOperand(1));
4931    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
4932  } else if (VT == MVT::i32) {
4933    // ExtractPS works with constant index.
4934    if (isa<ConstantSDNode>(Op.getOperand(1)))
4935      return Op;
4936  }
4937  return SDValue();
4938}
4939
4940
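/// LowerEXTRACT_VECTOR_ELT - Lower element extraction with a constant index.
/// Uses the SSE4.1 path when available; otherwise PEXTRW for 16-bit elements,
/// or a shuffle that moves the desired element into lane 0 followed by a
/// scalar extract for 32/64-bit elements.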
4941SDValue
4942X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
4943                                           SelectionDAG &DAG) const {
4944  if (!isa<ConstantSDNode>(Op.getOperand(1)))
4945    return SDValue();
4946
4947  if (Subtarget->hasSSE41()) {
4948    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
4949    if (Res.getNode())
4950      return Res;
4951  }
4952
4953  EVT VT = Op.getValueType();
4954  DebugLoc dl = Op.getDebugLoc();
4955  // TODO: handle v16i8.
4956  if (VT.getSizeInBits() == 16) {
4957    SDValue Vec = Op.getOperand(0);
4958    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4959    if (Idx == 0)
4960      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4961                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4962                                     DAG.getNode(ISD::BIT_CONVERT, dl,
4963                                                 MVT::v4i32, Vec),
4964                                     Op.getOperand(1)));
4965    // Transform it so it matches pextrw, which produces a 32-bit result.
4966    EVT EltVT = MVT::i32;
4967    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
4968                                    Op.getOperand(0), Op.getOperand(1));
4969    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
4970                                    DAG.getValueType(VT));
4971    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4972  } else if (VT.getSizeInBits() == 32) {
4973    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4974    if (Idx == 0)
4975      return Op;
4976
4977    // SHUFPS the element to the lowest double word, then movss.
4978    int Mask[4] = { Idx, -1, -1, -1 };
4979    EVT VVT = Op.getOperand(0).getValueType();
4980    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4981                                       DAG.getUNDEF(VVT), Mask);
4982    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4983                       DAG.getIntPtrConstant(0));
4984  } else if (VT.getSizeInBits() == 64) {
4985    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
4986    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
4987    //        to match extract_elt for f64.
4988    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4989    if (Idx == 0)
4990      return Op;
4991
4992    // UNPCKHPD the element to the lowest double word, then movsd.
4993    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
4994    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
4995    int Mask[2] = { 1, -1 };
4996    EVT VVT = Op.getOperand(0).getValueType();
4997    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4998                                       DAG.getUNDEF(VVT), Mask);
4999    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
5000                       DAG.getIntPtrConstant(0));
5001  }
5002
5003  return SDValue();
5004}
5005
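/// LowerINSERT_VECTOR_ELT_SSE4 - Lower element insertion using the SSE4.1
/// PINSRB, PINSRW (including the MMX variant), and INSERTPS forms when the
/// insertion index is a constant.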
5006SDValue
5007X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
5008                                               SelectionDAG &DAG) const {
5009  EVT VT = Op.getValueType();
5010  EVT EltVT = VT.getVectorElementType();
5011  DebugLoc dl = Op.getDebugLoc();
5012
5013  SDValue N0 = Op.getOperand(0);
5014  SDValue N1 = Op.getOperand(1);
5015  SDValue N2 = Op.getOperand(2);
5016
5017  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
5018      isa<ConstantSDNode>(N2)) {
5019    unsigned Opc;
5020    if (VT == MVT::v8i16)
5021      Opc = X86ISD::PINSRW;
5022    else if (VT == MVT::v4i16)
5023      Opc = X86ISD::MMX_PINSRW;
5024    else if (VT == MVT::v16i8)
5025      Opc = X86ISD::PINSRB;
5026    else
5027      Opc = X86ISD::PINSRB;
5028
5029    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
5030    // argument.
5031    if (N1.getValueType() != MVT::i32)
5032      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5033    if (N2.getValueType() != MVT::i32)
5034      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5035    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
5036  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
5037    // Bits [7:6] of the constant are the source select.  This will always be
5038    //  zero here.  The DAG Combiner may combine an extract_elt index into these
5039    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
5040    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
5041    // Bits [5:4] of the constant are the destination select.  This is the
5042    //  value of the incoming immediate.
5043    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
5044    //   combine either bitwise AND or insert of float 0.0 to set these bits.
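    //  For example, inserting into element 2 with no zeroing yields the
    //  immediate (2 << 4) = 0x20: source select 0, destination select 2,
    //  zero mask 0.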
5045    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
5046    // Create this as a scalar to vector.
5047    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
5048    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
5049  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
5050    // PINSR* works with constant index.
5051    return Op;
5052  }
5053  return SDValue();
5054}
5055
5056SDValue
5057X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
5058  EVT VT = Op.getValueType();
5059  EVT EltVT = VT.getVectorElementType();
5060
5061  if (Subtarget->hasSSE41())
5062    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
5063
5064  if (EltVT == MVT::i8)
5065    return SDValue();
5066
5067  DebugLoc dl = Op.getDebugLoc();
5068  SDValue N0 = Op.getOperand(0);
5069  SDValue N1 = Op.getOperand(1);
5070  SDValue N2 = Op.getOperand(2);
5071
5072  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
5073    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
5074    // as its second argument.
5075    if (N1.getValueType() != MVT::i32)
5076      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5077    if (N2.getValueType() != MVT::i32)
5078      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5079    return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
5080                       dl, VT, N0, N1, N2);
5081  }
5082  return SDValue();
5083}
5084
5085SDValue
5086X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5087  DebugLoc dl = Op.getDebugLoc();
5088
5089  if (Op.getValueType() == MVT::v1i64 &&
5090      Op.getOperand(0).getValueType() == MVT::i64)
5091    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
5092
5093  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
5094  EVT VT = MVT::v2i32;
5095  switch (Op.getValueType().getSimpleVT().SimpleTy) {
5096  default: break;
5097  case MVT::v16i8:
5098  case MVT::v8i16:
5099    VT = MVT::v4i32;
5100    break;
5101  }
5102  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
5103                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
5104}
5105
5106// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
5107// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
5108// one of the above-mentioned nodes. It has to be wrapped because otherwise
5109// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
5110// be used to form an addressing mode. These wrapped nodes will be selected
5111// into MOV32ri.
5112SDValue
5113X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
5114  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
5115
5116  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5117  // global base reg.
5118  unsigned char OpFlag = 0;
5119  unsigned WrapperKind = X86ISD::Wrapper;
5120  CodeModel::Model M = getTargetMachine().getCodeModel();
5121
5122  if (Subtarget->isPICStyleRIPRel() &&
5123      (M == CodeModel::Small || M == CodeModel::Kernel))
5124    WrapperKind = X86ISD::WrapperRIP;
5125  else if (Subtarget->isPICStyleGOT())
5126    OpFlag = X86II::MO_GOTOFF;
5127  else if (Subtarget->isPICStyleStubPIC())
5128    OpFlag = X86II::MO_PIC_BASE_OFFSET;
5129
5130  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
5131                                             CP->getAlignment(),
5132                                             CP->getOffset(), OpFlag);
5133  DebugLoc DL = CP->getDebugLoc();
5134  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5135  // With PIC, the address is actually $g + Offset.
5136  if (OpFlag) {
5137    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5138                         DAG.getNode(X86ISD::GlobalBaseReg,
5139                                     DebugLoc(), getPointerTy()),
5140                         Result);
5141  }
5142
5143  return Result;
5144}
5145
5146SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
5147  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
5148
5149  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5150  // global base reg.
5151  unsigned char OpFlag = 0;
5152  unsigned WrapperKind = X86ISD::Wrapper;
5153  CodeModel::Model M = getTargetMachine().getCodeModel();
5154
5155  if (Subtarget->isPICStyleRIPRel() &&
5156      (M == CodeModel::Small || M == CodeModel::Kernel))
5157    WrapperKind = X86ISD::WrapperRIP;
5158  else if (Subtarget->isPICStyleGOT())
5159    OpFlag = X86II::MO_GOTOFF;
5160  else if (Subtarget->isPICStyleStubPIC())
5161    OpFlag = X86II::MO_PIC_BASE_OFFSET;
5162
5163  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
5164                                          OpFlag);
5165  DebugLoc DL = JT->getDebugLoc();
5166  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5167
5168  // With PIC, the address is actually $g + Offset.
5169  if (OpFlag) {
5170    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5171                         DAG.getNode(X86ISD::GlobalBaseReg,
5172                                     DebugLoc(), getPointerTy()),
5173                         Result);
5174  }
5175
5176  return Result;
5177}
5178
5179SDValue
5180X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
5181  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
5182
5183  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5184  // global base reg.
5185  unsigned char OpFlag = 0;
5186  unsigned WrapperKind = X86ISD::Wrapper;
5187  CodeModel::Model M = getTargetMachine().getCodeModel();
5188
5189  if (Subtarget->isPICStyleRIPRel() &&
5190      (M == CodeModel::Small || M == CodeModel::Kernel))
5191    WrapperKind = X86ISD::WrapperRIP;
5192  else if (Subtarget->isPICStyleGOT())
5193    OpFlag = X86II::MO_GOTOFF;
5194  else if (Subtarget->isPICStyleStubPIC())
5195    OpFlag = X86II::MO_PIC_BASE_OFFSET;
5196
5197  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
5198
5199  DebugLoc DL = Op.getDebugLoc();
5200  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5201
5202
5203  // With PIC, the address is actually $g + Offset.
5204  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
5205      !Subtarget->is64Bit()) {
5206    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5207                         DAG.getNode(X86ISD::GlobalBaseReg,
5208                                     DebugLoc(), getPointerTy()),
5209                         Result);
5210  }
5211
5212  return Result;
5213}
5214
5215SDValue
5216X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
5217  // Create the TargetBlockAddress node.
5218  unsigned char OpFlags =
5219    Subtarget->ClassifyBlockAddressReference();
5220  CodeModel::Model M = getTargetMachine().getCodeModel();
5221  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
5222  DebugLoc dl = Op.getDebugLoc();
5223  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
5224                                       /*isTarget=*/true, OpFlags);
5225
5226  if (Subtarget->isPICStyleRIPRel() &&
5227      (M == CodeModel::Small || M == CodeModel::Kernel))
5228    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5229  else
5230    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5231
5232  // With PIC, the address is actually $g + Offset.
5233  if (isGlobalRelativeToPICBase(OpFlags)) {
5234    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5235                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5236                         Result);
5237  }
5238
5239  return Result;
5240}
5241
5242SDValue
5243X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
5244                                      int64_t Offset,
5245                                      SelectionDAG &DAG) const {
5246  // Create the TargetGlobalAddress node, folding in the constant
5247  // offset if it is legal.
5248  unsigned char OpFlags =
5249    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
5250  CodeModel::Model M = getTargetMachine().getCodeModel();
5251  SDValue Result;
5252  if (OpFlags == X86II::MO_NO_FLAG &&
5253      X86::isOffsetSuitableForCodeModel(Offset, M)) {
5254    // A direct static reference to a global.
5255    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
5256    Offset = 0;
5257  } else {
5258    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
5259  }
5260
5261  if (Subtarget->isPICStyleRIPRel() &&
5262      (M == CodeModel::Small || M == CodeModel::Kernel))
5263    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5264  else
5265    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5266
5267  // With PIC, the address is actually $g + Offset.
5268  if (isGlobalRelativeToPICBase(OpFlags)) {
5269    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5270                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5271                         Result);
5272  }
5273
5274  // For globals that require a load from a stub to get the address, emit the
5275  // load.
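  // (Illustrative sketch only: on 32-bit Darwin PIC this becomes a load of
  //  the non-lazy pointer relative to the PIC base, roughly
  //  "movl L_foo$non_lazy_ptr-L0$pb(%reg), %reg"; the stub name and PIC-base
  //  label here are assumptions, not what this code emits verbatim.)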
5276  if (isGlobalStubReference(OpFlags))
5277    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
5278                         PseudoSourceValue::getGOT(), 0, false, false, 0);
5279
5280  // If there was a non-zero offset that we didn't fold, create an explicit
5281  // addition for it.
5282  if (Offset != 0)
5283    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
5284                         DAG.getConstant(Offset, getPointerTy()));
5285
5286  return Result;
5287}
5288
5289SDValue
5290X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
5291  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
5292  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
5293  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
5294}
5295
5296static SDValue
5297GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
5298           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
5299           unsigned char OperandFlags) {
5300  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5301  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
5302  DebugLoc dl = GA->getDebugLoc();
5303  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
5304                                           GA->getValueType(0),
5305                                           GA->getOffset(),
5306                                           OperandFlags);
5307  if (InFlag) {
5308    SDValue Ops[] = { Chain,  TGA, *InFlag };
5309    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
5310  } else {
5311    SDValue Ops[]  = { Chain, TGA };
5312    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
5313  }
5314
5315  // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
5316  MFI->setAdjustsStack(true);
5317
5318  SDValue Flag = Chain.getValue(1);
5319  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
5320}
5321
5322// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
5323static SDValue
5324LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5325                                const EVT PtrVT) {
5326  SDValue InFlag;
5327  DebugLoc dl = GA->getDebugLoc();  // ??? The function entry point might be better here.
5328  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
5329                                     DAG.getNode(X86ISD::GlobalBaseReg,
5330                                                 DebugLoc(), PtrVT), InFlag);
5331  InFlag = Chain.getValue(1);
5332
5333  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
5334}
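// Illustrative sketch only: the nodes built above are selected to roughly
//
//   leal  x@TLSGD(,%ebx,1), %eax
//   call  ___tls_get_addr@PLT     # returns &x in %eax
//
// with %ebx holding the GOT address, which is why EBX is set up first.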
5335
5336// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
5337static SDValue
5338LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5339                                const EVT PtrVT) {
5340  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
5341                    X86::RAX, X86II::MO_TLSGD);
5342}
5343
5344// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
5345// "local exec" model.
5346static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5347                                   const EVT PtrVT, TLSModel::Model model,
5348                                   bool is64Bit) {
5349  DebugLoc dl = GA->getDebugLoc();
5350  // Get the Thread Pointer
5351  SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
5352                             DebugLoc(), PtrVT,
5353                             DAG.getRegister(is64Bit? X86::FS : X86::GS,
5354                                             MVT::i32));
5355
5356  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
5357                                      NULL, 0, false, false, 0);
5358
5359  unsigned char OperandFlags = 0;
5360  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
5361  // initial exec.
5362  unsigned WrapperKind = X86ISD::Wrapper;
5363  if (model == TLSModel::LocalExec) {
5364    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
5365  } else if (is64Bit) {
5366    assert(model == TLSModel::InitialExec);
5367    OperandFlags = X86II::MO_GOTTPOFF;
5368    WrapperKind = X86ISD::WrapperRIP;
5369  } else {
5370    assert(model == TLSModel::InitialExec);
5371    OperandFlags = X86II::MO_INDNTPOFF;
5372  }
5373
5374  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
5375  // exec)
5376  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
5377                                           GA->getValueType(0),
5378                                           GA->getOffset(), OperandFlags);
5379  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
5380
5381  if (model == TLSModel::InitialExec)
5382    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
5383                         PseudoSourceValue::getGOT(), 0, false, false, 0);
5384
5385  // The address of the thread local variable is the add of the thread
5386  // pointer with the offset of the variable.
5387  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
5388}
5389
5390SDValue
5391X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
5392
5393  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
5394  const GlobalValue *GV = GA->getGlobal();
5395
5396  if (Subtarget->isTargetELF()) {
5397    // TODO: implement the "local dynamic" model
5398    // TODO: implement the "initial exec" model for PIC executables
5399
5400    // If GV is an alias then use the aliasee for determining
5401    // thread-localness.
5402    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
5403      GV = GA->resolveAliasedGlobal(false);
5404
5405    TLSModel::Model model
5406      = getTLSModel(GV, getTargetMachine().getRelocationModel());
5407
5408    switch (model) {
5409      case TLSModel::GeneralDynamic:
5410      case TLSModel::LocalDynamic: // not implemented
5411        if (Subtarget->is64Bit())
5412          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
5413        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
5414
5415      case TLSModel::InitialExec:
5416      case TLSModel::LocalExec:
5417        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
5418                                   Subtarget->is64Bit());
5419    }
5420  } else if (Subtarget->isTargetDarwin()) {
5421    // Darwin only has one model of TLS.  Lower to that.
5422    unsigned char OpFlag = 0;
5423    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
5424                           X86ISD::WrapperRIP : X86ISD::Wrapper;
5425
5426    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5427    // global base reg.
5428    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
5429                  !Subtarget->is64Bit();
5430    if (PIC32)
5431      OpFlag = X86II::MO_TLVP_PIC_BASE;
5432    else
5433      OpFlag = X86II::MO_TLVP;
5434    DebugLoc DL = Op.getDebugLoc();
5435    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
5436                                                getPointerTy(),
5437                                                GA->getOffset(), OpFlag);
5438    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5439
5440    // With PIC32, the address is actually $g + Offset.
5441    if (PIC32)
5442      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5443                           DAG.getNode(X86ISD::GlobalBaseReg,
5444                                       DebugLoc(), getPointerTy()),
5445                           Offset);
5446
5447    // Lowering this to the machine-level ISD node (TLSCALL) makes sure
5448    // everything ends up in the right location.
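    // (Illustrative sketch only: on x86-64 Darwin this ends up as roughly
    //      movq  _x@TLVP(%rip), %rdi
    //      callq *(%rdi)
    //  with the variable's address returned in %rax, as read back below.)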
5449    SDValue Args[] = { Offset };
5450    SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1);
5451
5452    // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
5453    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5454    MFI->setAdjustsStack(true);
5455
5456    // And our return value (tls address) is in the standard call return value
5457    // location.
5458    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
5459    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
5460  }
5461
5462  assert(false &&
5463         "TLS not implemented for this target.");
5464
5465  llvm_unreachable("Unreachable");
5466  return SDValue();
5467}
5468
5469
5470/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
5471/// take a 2 x i32 value to shift plus a shift amount.
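///
/// An illustrative C model of the SHL_PARTS case for i32 pieces (this only
/// mirrors the nodes built below, assuming x86's shift-amount masking; the
/// helper name is made up):
///
///   void shl64(uint32_t lo, uint32_t hi, unsigned amt,
///              uint32_t *rlo, uint32_t *rhi) {
///     unsigned a = amt & 31;
///     uint32_t t2 = a ? ((hi << a) | (lo >> (32 - a))) : hi;  // SHLD
///     uint32_t t3 = lo << a;                                  // SHL
///     if (amt & 32) { *rhi = t3; *rlo = 0;  }                 // CMOVs
///     else          { *rhi = t2; *rlo = t3; }
///   }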
5472SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
5473  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
5474  EVT VT = Op.getValueType();
5475  unsigned VTBits = VT.getSizeInBits();
5476  DebugLoc dl = Op.getDebugLoc();
5477  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
5478  SDValue ShOpLo = Op.getOperand(0);
5479  SDValue ShOpHi = Op.getOperand(1);
5480  SDValue ShAmt  = Op.getOperand(2);
5481  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
5482                                     DAG.getConstant(VTBits - 1, MVT::i8))
5483                       : DAG.getConstant(0, VT);
5484
5485  SDValue Tmp2, Tmp3;
5486  if (Op.getOpcode() == ISD::SHL_PARTS) {
5487    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
5488    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
5489  } else {
5490    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
5491    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
5492  }
5493
5494  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
5495                                DAG.getConstant(VTBits, MVT::i8));
5496  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
5497                             AndNode, DAG.getConstant(0, MVT::i8));
5498
5499  SDValue Hi, Lo;
5500  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5501  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
5502  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
5503
5504  if (Op.getOpcode() == ISD::SHL_PARTS) {
5505    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
5506    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
5507  } else {
5508    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
5509    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
5510  }
5511
5512  SDValue Ops[2] = { Lo, Hi };
5513  return DAG.getMergeValues(Ops, 2, dl);
5514}
5515
5516SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
5517                                           SelectionDAG &DAG) const {
5518  EVT SrcVT = Op.getOperand(0).getValueType();
5519
5520  if (SrcVT.isVector()) {
5521    if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
5522      return Op;
5523    }
5524    return SDValue();
5525  }
5526
5527  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
5528         "Unknown SINT_TO_FP to lower!");
5529
5530  // These are really Legal; return the operand so the caller accepts it as
5531  // Legal.
5532  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
5533    return Op;
5534  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
5535      Subtarget->is64Bit()) {
5536    return Op;
5537  }
5538
5539  DebugLoc dl = Op.getDebugLoc();
5540  unsigned Size = SrcVT.getSizeInBits()/8;
5541  MachineFunction &MF = DAG.getMachineFunction();
5542  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
5543  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5544  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5545                               StackSlot,
5546                               PseudoSourceValue::getFixedStack(SSFI), 0,
5547                               false, false, 0);
5548  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
5549}
5550
5551SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
5552                                     SDValue StackSlot,
5553                                     SelectionDAG &DAG) const {
5554  // Build the FILD
5555  DebugLoc dl = Op.getDebugLoc();
5556  SDVTList Tys;
5557  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
5558  if (useSSE)
5559    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
5560  else
5561    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
5562  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
5563  SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
5564                               Tys, Ops, array_lengthof(Ops));
5565
5566  if (useSSE) {
5567    Chain = Result.getValue(1);
5568    SDValue InFlag = Result.getValue(2);
5569
5570    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
5571    // shouldn't be necessary except that RFP cannot be live across
5572    // multiple blocks. When stackifier is fixed, they can be uncoupled.
5573    MachineFunction &MF = DAG.getMachineFunction();
5574    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
5575    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5576    Tys = DAG.getVTList(MVT::Other);
5577    SDValue Ops[] = {
5578      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
5579    };
5580    Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops));
5581    Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
5582                         PseudoSourceValue::getFixedStack(SSFI), 0,
5583                         false, false, 0);
5584  }
5585
5586  return Result;
5587}
5588
5589// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
5590SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
5591                                               SelectionDAG &DAG) const {
5592  // This algorithm is not obvious. Here it is in C code, more or less:
5593  /*
5594    double uint64_to_double( uint32_t hi, uint32_t lo ) {
5595      static const __m128i exp = { 0x4330000045300000ULL, 0 };
5596      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
5597
5598      // Copy ints to xmm registers.
5599      __m128i xh = _mm_cvtsi32_si128( hi );
5600      __m128i xl = _mm_cvtsi32_si128( lo );
5601
5602      // Combine into low half of a single xmm register.
5603      __m128i x = _mm_unpacklo_epi32( xh, xl );
5604      __m128d d;
5605      double sd;
5606
5607      // Merge in appropriate exponents to give the integer bits the right
5608      // magnitude.
5609      x = _mm_unpacklo_epi32( x, exp );
5610
5611      // Subtract away the biases to deal with the IEEE-754 double precision
5612      // implicit 1.
5613      d = _mm_sub_pd( (__m128d) x, bias );
5614
5615      // All conversions up to here are exact. The correctly rounded result is
5616      // calculated using the current rounding mode using the following
5617      // horizontal add.
5618      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
5619      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
5620                                // store doesn't really need to be here (except
5621                                // maybe to zero the other double)
5622      return sd;
5623    }
5624  */
5625
5626  DebugLoc dl = Op.getDebugLoc();
5627  LLVMContext *Context = DAG.getContext();
5628
5629  // Build some magic constants.
5630  std::vector<Constant*> CV0;
5631  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
5632  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
5633  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
5634  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
5635  Constant *C0 = ConstantVector::get(CV0);
5636  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
5637
5638  std::vector<Constant*> CV1;
5639  CV1.push_back(
5640    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
5641  CV1.push_back(
5642    ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
5643  Constant *C1 = ConstantVector::get(CV1);
5644  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
5645
5646  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5647                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5648                                        Op.getOperand(0),
5649                                        DAG.getIntPtrConstant(1)));
5650  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5651                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5652                                        Op.getOperand(0),
5653                                        DAG.getIntPtrConstant(0)));
5654  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
5655  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
5656                              PseudoSourceValue::getConstantPool(), 0,
5657                              false, false, 16);
5658  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
5659  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
5660  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
5661                              PseudoSourceValue::getConstantPool(), 0,
5662                              false, false, 16);
5663  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
5664
5665  // Add the halves; easiest way is to swap them into another reg first.
5666  int ShufMask[2] = { 1, -1 };
5667  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
5668                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
5669  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
5670  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
5671                     DAG.getIntPtrConstant(0));
5672}
5673
5674// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
5675SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
5676                                               SelectionDAG &DAG) const {
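  // Illustrative C model of the trick used below (not part of the compiler;
  // memcpy stands in for the BIT_CONVERT/vector shuffling):
  /*
    double uint32_to_double(uint32_t u) {
      // OR the value into the mantissa of 2^52 ...
      uint64_t bits = 0x4330000000000000ULL | (uint64_t)u;
      double d;
      memcpy(&d, &bits, sizeof d);
      // ... then subtract 2^52; the result is exactly (double)u.
      return d - 0x1.0p52;
    }
  */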
5677  DebugLoc dl = Op.getDebugLoc();
5678  // FP constant to bias correct the final result.
5679  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
5680                                   MVT::f64);
5681
5682  // Load the 32-bit value into an XMM register.
5683  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5684                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5685                                         Op.getOperand(0),
5686                                         DAG.getIntPtrConstant(0)));
5687
5688  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5689                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
5690                     DAG.getIntPtrConstant(0));
5691
5692  // Or the load with the bias.
5693  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
5694                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5695                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5696                                                   MVT::v2f64, Load)),
5697                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5698                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5699                                                   MVT::v2f64, Bias)));
5700  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5701                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
5702                   DAG.getIntPtrConstant(0));
5703
5704  // Subtract the bias.
5705  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
5706
5707  // Handle final rounding.
5708  EVT DestVT = Op.getValueType();
5709
5710  if (DestVT.bitsLT(MVT::f64)) {
5711    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
5712                       DAG.getIntPtrConstant(0));
5713  } else if (DestVT.bitsGT(MVT::f64)) {
5714    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
5715  }
5716
5717  // DestVT is f64, so no further rounding or extension is needed.
5718  return Sub;
5719}
5720
5721SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
5722                                           SelectionDAG &DAG) const {
5723  SDValue N0 = Op.getOperand(0);
5724  DebugLoc dl = Op.getDebugLoc();
5725
5726  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
5727  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
5728  // the optimization here.
5729  if (DAG.SignBitIsZero(N0))
5730    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
5731
5732  EVT SrcVT = N0.getValueType();
5733  EVT DstVT = Op.getValueType();
5734  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
5735    return LowerUINT_TO_FP_i64(Op, DAG);
5736  else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
5737    return LowerUINT_TO_FP_i32(Op, DAG);
5738
5739  // Make a 64-bit buffer, and use it to build an FILD.
5740  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
5741  if (SrcVT == MVT::i32) {
5742    SDValue WordOff = DAG.getConstant(4, getPointerTy());
5743    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
5744                                     getPointerTy(), StackSlot, WordOff);
5745    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5746                                  StackSlot, NULL, 0, false, false, 0);
5747    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
5748                                  OffsetSlot, NULL, 0, false, false, 0);
5749    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
5750    return Fild;
5751  }
5752
5753  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
5754  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5755                                StackSlot, NULL, 0, false, false, 0);
5756  // For i64 source, we need to add the appropriate power of 2 if the input
5757  // was negative.  This is the same as the optimization in
5758  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
5759  // we must be careful to do the computation in x87 extended precision, not
5760  // in SSE. (The generic code can't know it's OK to do this, or how to.)
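  //
  // Illustrative model of the fixup (names made up; the x87 f80 arithmetic is
  // what keeps the addition exact enough):
  //
  //   long double uint64_to_fp(uint64_t u) {
  //     long double d = (long double)(int64_t)u;  // FILD of the raw bits
  //     if ((int64_t)u < 0) d += 0x1.0p64L;       // add 2^64 back; 0x5F800000
  //                                               // below is 2^64 as an f32
  //     return d;                                 // then FP_ROUND to DstVT
  //   }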
5761  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
5762  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
5763  SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3);
5764
5765  APInt FF(32, 0x5F800000ULL);
5766
5767  // Check whether the sign bit is set.
5768  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
5769                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
5770                                 ISD::SETLT);
5771
5772  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
5773  SDValue FudgePtr = DAG.getConstantPool(
5774                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
5775                                         getPointerTy());
5776
5777  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
5778  SDValue Zero = DAG.getIntPtrConstant(0);
5779  SDValue Four = DAG.getIntPtrConstant(4);
5780  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
5781                               Zero, Four);
5782  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
5783
5784  // Load the value out, extending it from f32 to f80.
5785  // FIXME: Avoid the extend by constructing the right constant pool?
5786  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(),
5787                                 FudgePtr, PseudoSourceValue::getConstantPool(),
5788                                 0, MVT::f32, false, false, 4);
5789  // Extend everything to 80 bits to force it to be done on x87.
5790  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
5791  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
5792}
5793
5794std::pair<SDValue,SDValue> X86TargetLowering::
5795FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const {
5796  DebugLoc dl = Op.getDebugLoc();
5797
5798  EVT DstTy = Op.getValueType();
5799
5800  if (!IsSigned) {
5801    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
5802    DstTy = MVT::i64;
5803  }
5804
5805  assert(DstTy.getSimpleVT() <= MVT::i64 &&
5806         DstTy.getSimpleVT() >= MVT::i16 &&
5807         "Unknown FP_TO_SINT to lower!");
5808
5809  // These are really Legal.
5810  if (DstTy == MVT::i32 &&
5811      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5812    return std::make_pair(SDValue(), SDValue());
5813  if (Subtarget->is64Bit() &&
5814      DstTy == MVT::i64 &&
5815      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5816    return std::make_pair(SDValue(), SDValue());
5817
5818  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
5819  // stack slot.
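  //
  // Illustrative sketch only (SSE f64 -> i64 case; slot names are made up,
  // and the *_IN_MEM pseudo also switches the FPU control word to truncation
  // around the fistp):
  //
  //   movsd   %xmm0, <slot1>      # spill the SSE value
  //   fldl    <slot1>             # X86ISD::FLD
  //   fistpll <slot2>             # FP_TO_INT64_IN_MEM
  //   # ...the caller then loads the i64 result from <slot2>.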
5820  MachineFunction &MF = DAG.getMachineFunction();
5821  unsigned MemSize = DstTy.getSizeInBits()/8;
5822  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
5823  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5824
5825  unsigned Opc;
5826  switch (DstTy.getSimpleVT().SimpleTy) {
5827  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
5828  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
5829  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
5830  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
5831  }
5832
5833  SDValue Chain = DAG.getEntryNode();
5834  SDValue Value = Op.getOperand(0);
5835  if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
5836    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
5837    Chain = DAG.getStore(Chain, dl, Value, StackSlot,
5838                         PseudoSourceValue::getFixedStack(SSFI), 0,
5839                         false, false, 0);
5840    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
5841    SDValue Ops[] = {
5842      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
5843    };
5844    Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
5845    Chain = Value.getValue(1);
5846    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
5847    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5848  }
5849
5850  // Build the FP_TO_INT*_IN_MEM
5851  SDValue Ops[] = { Chain, Value, StackSlot };
5852  SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
5853
5854  return std::make_pair(FIST, StackSlot);
5855}
5856
5857SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
5858                                           SelectionDAG &DAG) const {
5859  if (Op.getValueType().isVector()) {
5860    if (Op.getValueType() == MVT::v2i32 &&
5861        Op.getOperand(0).getValueType() == MVT::v2f64) {
5862      return Op;
5863    }
5864    return SDValue();
5865  }
5866
5867  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
5868  SDValue FIST = Vals.first, StackSlot = Vals.second;
5869  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
5870  if (FIST.getNode() == 0) return Op;
5871
5872  // Load the result.
5873  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5874                     FIST, StackSlot, NULL, 0, false, false, 0);
5875}
5876
5877SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
5878                                           SelectionDAG &DAG) const {
5879  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
5880  SDValue FIST = Vals.first, StackSlot = Vals.second;
5881  assert(FIST.getNode() && "Unexpected failure");
5882
5883  // Load the result.
5884  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5885                     FIST, StackSlot, NULL, 0, false, false, 0);
5886}
5887
5888SDValue X86TargetLowering::LowerFABS(SDValue Op,
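// LowerFABS - fabs is lowered as a bitwise AND with a constant-pool mask that
// clears the sign bit.  Illustrative C model of the f64 case (memcpy stands
// in for the bitcasts; the helper name is made up):
/*
  double fabs_bits(double x) {
    uint64_t b;
    memcpy(&b, &x, sizeof b);
    b &= ~(1ULL << 63);          // the FAND with the mask built below
    memcpy(&x, &b, sizeof x);
    return x;
  }
*/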
5889                                     SelectionDAG &DAG) const {
5890  LLVMContext *Context = DAG.getContext();
5891  DebugLoc dl = Op.getDebugLoc();
5892  EVT VT = Op.getValueType();
5893  EVT EltVT = VT;
5894  if (VT.isVector())
5895    EltVT = VT.getVectorElementType();
5896  std::vector<Constant*> CV;
5897  if (EltVT == MVT::f64) {
5898    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
5899    CV.push_back(C);
5900    CV.push_back(C);
5901  } else {
5902    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
5903    CV.push_back(C);
5904    CV.push_back(C);
5905    CV.push_back(C);
5906    CV.push_back(C);
5907  }
5908  Constant *C = ConstantVector::get(CV);
5909  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5910  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5911                             PseudoSourceValue::getConstantPool(), 0,
5912                             false, false, 16);
5913  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
5914}
5915
5916SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
5917  LLVMContext *Context = DAG.getContext();
5918  DebugLoc dl = Op.getDebugLoc();
5919  EVT VT = Op.getValueType();
5920  EVT EltVT = VT;
5921  if (VT.isVector())
5922    EltVT = VT.getVectorElementType();
5923  std::vector<Constant*> CV;
5924  if (EltVT == MVT::f64) {
5925    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
5926    CV.push_back(C);
5927    CV.push_back(C);
5928  } else {
5929    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
5930    CV.push_back(C);
5931    CV.push_back(C);
5932    CV.push_back(C);
5933    CV.push_back(C);
5934  }
5935  Constant *C = ConstantVector::get(CV);
5936  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5937  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5938                             PseudoSourceValue::getConstantPool(), 0,
5939                             false, false, 16);
5940  if (VT.isVector()) {
5941    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
5942                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
5943                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5944                                Op.getOperand(0)),
5945                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
5946  } else {
5947    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
5948  }
5949}
5950
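// LowerFCOPYSIGN - copysign is lowered with two constant-pool masks: one keeps
// only the sign bit of the second operand, the other clears the sign bit of
// the first, and the two pieces are OR'd together.  Illustrative C model for
// f64 (memcpy stands in for the bitcasts; the helper name is made up):
/*
  double copysign_bits(double mag, double sgn) {
    uint64_t m, s;
    memcpy(&m, &mag, sizeof m);
    memcpy(&s, &sgn, sizeof s);
    s &=  (1ULL << 63);          // sign of the second operand (FAND, Mask1)
    m &= ~(1ULL << 63);          // magnitude of the first     (FAND, Mask2)
    m |= s;                      // FOR
    memcpy(&mag, &m, sizeof m);
    return mag;
  }
*/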
5951SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5952  LLVMContext *Context = DAG.getContext();
5953  SDValue Op0 = Op.getOperand(0);
5954  SDValue Op1 = Op.getOperand(1);
5955  DebugLoc dl = Op.getDebugLoc();
5956  EVT VT = Op.getValueType();
5957  EVT SrcVT = Op1.getValueType();
5958
5959  // If second operand is smaller, extend it first.
5960  if (SrcVT.bitsLT(VT)) {
5961    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
5962    SrcVT = VT;
5963  }
5964  // And if it is bigger, shrink it first.
5965  if (SrcVT.bitsGT(VT)) {
5966    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
5967    SrcVT = VT;
5968  }
5969
5970  // At this point the operands and the result should have the same
5971  // type, and that won't be f80 since that is not custom lowered.
5972
5973  // First get the sign bit of second operand.
5974  std::vector<Constant*> CV;
5975  if (SrcVT == MVT::f64) {
5976    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
5977    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5978  } else {
5979    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
5980    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5981    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5982    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5983  }
5984  Constant *C = ConstantVector::get(CV);
5985  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5986  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
5987                              PseudoSourceValue::getConstantPool(), 0,
5988                              false, false, 16);
5989  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
5990
5991  // Shift sign bit right or left if the two operands have different types.
5992  if (SrcVT.bitsGT(VT)) {
5993    // Op0 is MVT::f32, Op1 is MVT::f64.
5994    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
5995    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
5996                          DAG.getConstant(32, MVT::i32));
5997    SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
5998    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
5999                          DAG.getIntPtrConstant(0));
6000  }
6001
6002  // Clear first operand sign bit.
6003  CV.clear();
6004  if (VT == MVT::f64) {
6005    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
6006    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
6007  } else {
6008    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
6009    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
6010    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
6011    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
6012  }
6013  C = ConstantVector::get(CV);
6014  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
6015  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
6016                              PseudoSourceValue::getConstantPool(), 0,
6017                              false, false, 16);
6018  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
6019
6020  // Or the value with the sign bit.
6021  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
6022}
6023
6024/// Emit nodes that will be selected as "test Op0,Op0", or something
6025/// equivalent.
6026SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
6027                                    SelectionDAG &DAG) const {
6028  DebugLoc dl = Op.getDebugLoc();
6029
6030  // CF and OF aren't always set the way we want. Determine which
6031  // of these we need.
6032  bool NeedCF = false;
6033  bool NeedOF = false;
6034  switch (X86CC) {
6035  default: break;
6036  case X86::COND_A: case X86::COND_AE:
6037  case X86::COND_B: case X86::COND_BE:
6038    NeedCF = true;
6039    break;
6040  case X86::COND_G: case X86::COND_GE:
6041  case X86::COND_L: case X86::COND_LE:
6042  case X86::COND_O: case X86::COND_NO:
6043    NeedOF = true;
6044    break;
6045  }
6046
6047  // See if we can use the EFLAGS value from the operand instead of
6048  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
6049  // we prove that the arithmetic won't overflow, we can't use OF or CF.
6050  if (Op.getResNo() != 0 || NeedOF || NeedCF)
6051    // Emit a CMP with 0, which is the TEST pattern.
6052    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6053                       DAG.getConstant(0, Op.getValueType()));
6054
6055  unsigned Opcode = 0;
6056  unsigned NumOperands = 0;
6057  switch (Op.getNode()->getOpcode()) {
6058  case ISD::ADD:
6059    // Due to an isel shortcoming, be conservative if this add is likely to be
6060    // selected as part of a load-modify-store instruction. When the root node
6061    // in a match is a store, isel doesn't know how to remap non-chain non-flag
6062    // uses of other nodes in the match, such as the ADD in this case. This
6063    // leads to the ADD being left around and reselected, with the result being
6064    // two adds in the output.  Alas, even if none of our users are stores, that
6065    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
6066    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
6067    // climbing the DAG back to the root, and it doesn't seem to be worth the
6068    // effort.
6069    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6070           UE = Op.getNode()->use_end(); UI != UE; ++UI)
6071      if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
6072        goto default_case;
6073
6074    if (ConstantSDNode *C =
6075        dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
6076      // An add of one will be selected as an INC.
6077      if (C->getAPIntValue() == 1) {
6078        Opcode = X86ISD::INC;
6079        NumOperands = 1;
6080        break;
6081      }
6082
6083      // An add of negative one (subtract of one) will be selected as a DEC.
6084      if (C->getAPIntValue().isAllOnesValue()) {
6085        Opcode = X86ISD::DEC;
6086        NumOperands = 1;
6087        break;
6088      }
6089    }
6090
6091    // Otherwise use a regular EFLAGS-setting add.
6092    Opcode = X86ISD::ADD;
6093    NumOperands = 2;
6094    break;
6095  case ISD::AND: {
6096    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
6097    // because a TEST instruction will be better.
6098    bool NonFlagUse = false;
6099    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6100           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
6101      SDNode *User = *UI;
6102      unsigned UOpNo = UI.getOperandNo();
6103      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
6104        // Look past the truncate.
6105        UOpNo = User->use_begin().getOperandNo();
6106        User = *User->use_begin();
6107      }
6108
6109      if (User->getOpcode() != ISD::BRCOND &&
6110          User->getOpcode() != ISD::SETCC &&
6111          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
6112        NonFlagUse = true;
6113        break;
6114      }
6115    }
6116
6117    if (!NonFlagUse)
6118      break;
6119  }
6120    // FALL THROUGH
6121  case ISD::SUB:
6122  case ISD::OR:
6123  case ISD::XOR:
6124    // Due to the ISEL shortcoming noted above, be conservative if this op is
6125    // likely to be selected as part of a load-modify-store instruction.
6126    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6127           UE = Op.getNode()->use_end(); UI != UE; ++UI)
6128      if (UI->getOpcode() == ISD::STORE)
6129        goto default_case;
6130
6131    // Otherwise use a regular EFLAGS-setting instruction.
6132    switch (Op.getNode()->getOpcode()) {
6133    default: llvm_unreachable("unexpected operator!");
6134    case ISD::SUB: Opcode = X86ISD::SUB; break;
6135    case ISD::OR:  Opcode = X86ISD::OR;  break;
6136    case ISD::XOR: Opcode = X86ISD::XOR; break;
6137    case ISD::AND: Opcode = X86ISD::AND; break;
6138    }
6139
6140    NumOperands = 2;
6141    break;
6142  case X86ISD::ADD:
6143  case X86ISD::SUB:
6144  case X86ISD::INC:
6145  case X86ISD::DEC:
6146  case X86ISD::OR:
6147  case X86ISD::XOR:
6148  case X86ISD::AND:
6149    return SDValue(Op.getNode(), 1);
6150  default:
6151  default_case:
6152    break;
6153  }
6154
6155  if (Opcode == 0)
6156    // Emit a CMP with 0, which is the TEST pattern.
6157    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6158                       DAG.getConstant(0, Op.getValueType()));
6159
6160  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
6161  SmallVector<SDValue, 4> Ops;
6162  for (unsigned i = 0; i != NumOperands; ++i)
6163    Ops.push_back(Op.getOperand(i));
6164
6165  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
6166  DAG.ReplaceAllUsesWith(Op, New);
6167  return SDValue(New.getNode(), 1);
6168}
6169
6170/// Emit nodes that will be selected as "cmp Op0,Op1", or something
6171/// equivalent.
6172SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
6173                                   SelectionDAG &DAG) const {
6174  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
6175    if (C->getAPIntValue() == 0)
6176      return EmitTest(Op0, X86CC, DAG);
6177
6178  DebugLoc dl = Op0.getDebugLoc();
6179  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
6180}
6181
6182/// LowerToBT - The result of an 'and' is compared against zero. Turn it into
6183/// a BT node if possible.
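///
/// For example (illustrative):
///   (X & (1 << N)) == 0   -->   BT X, N ; use COND_AE (carry clear)
///   ((X >> N) & 1) != 0   -->   BT X, N ; use COND_B  (carry set)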
6184SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
6185                                     DebugLoc dl, SelectionDAG &DAG) const {
6186  SDValue Op0 = And.getOperand(0);
6187  SDValue Op1 = And.getOperand(1);
6188  if (Op0.getOpcode() == ISD::TRUNCATE)
6189    Op0 = Op0.getOperand(0);
6190  if (Op1.getOpcode() == ISD::TRUNCATE)
6191    Op1 = Op1.getOperand(0);
6192
6193  SDValue LHS, RHS;
6194  if (Op1.getOpcode() == ISD::SHL)
6195    std::swap(Op0, Op1);
6196  if (Op0.getOpcode() == ISD::SHL) {
6197    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
6198      if (And00C->getZExtValue() == 1) {
6199        // If we looked past a truncate, check that it's only truncating away
6200        // known zeros.
6201        unsigned BitWidth = Op0.getValueSizeInBits();
6202        unsigned AndBitWidth = And.getValueSizeInBits();
6203        if (BitWidth > AndBitWidth) {
6204          APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones;
6205          DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones);
6206          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
6207            return SDValue();
6208        }
6209        LHS = Op1;
6210        RHS = Op0.getOperand(1);
6211      }
6212  } else if (Op1.getOpcode() == ISD::Constant) {
6213    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
6214    SDValue AndLHS = Op0;
6215    if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
6216      LHS = AndLHS.getOperand(0);
6217      RHS = AndLHS.getOperand(1);
6218    }
6219  }
6220
6221  if (LHS.getNode()) {
6222    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
6223    // instruction.  Since the shift amount is in-range-or-undefined, we know
6224    // that doing a bittest on the i32 value is ok.  We extend to i32 because
6225    // the encoding for the i16 version is larger than the i32 version.
6226    // Also promote i16 to i32 for performance / code size reasons.
6227    if (LHS.getValueType() == MVT::i8 ||
6228        LHS.getValueType() == MVT::i16)
6229      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
6230
6231    // If the operand types disagree, extend the shift amount to match.  Since
6232    // BT ignores high bits (like shifts) we can use anyextend.
6233    if (LHS.getValueType() != RHS.getValueType())
6234      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
6235
6236    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
6237    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
6238    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6239                       DAG.getConstant(Cond, MVT::i8), BT);
6240  }
6241
6242  return SDValue();
6243}
6244
6245SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
6246  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
6247  SDValue Op0 = Op.getOperand(0);
6248  SDValue Op1 = Op.getOperand(1);
6249  DebugLoc dl = Op.getDebugLoc();
6250  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6251
6252  // Optimize to BT if possible.
6253  // Lower (X & (1 << N)) == 0 to BT(X, N).
6254  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
6255  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
6256  if (Op0.getOpcode() == ISD::AND &&
6257      Op0.hasOneUse() &&
6258      Op1.getOpcode() == ISD::Constant &&
6259      cast<ConstantSDNode>(Op1)->isNullValue() &&
6260      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6261    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
6262    if (NewSetCC.getNode())
6263      return NewSetCC;
6264  }
6265
6266  // Look for "(setcc) == / != 1" to avoid unnecessary setcc.
6267  if (Op0.getOpcode() == X86ISD::SETCC &&
6268      Op1.getOpcode() == ISD::Constant &&
6269      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
6270       cast<ConstantSDNode>(Op1)->isNullValue()) &&
6271      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6272    X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
6273    bool Invert = (CC == ISD::SETNE) ^
6274      cast<ConstantSDNode>(Op1)->isNullValue();
6275    if (Invert)
6276      CCode = X86::GetOppositeBranchCondition(CCode);
6277    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6278                       DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
6279  }
6280
6281  bool isFP = Op1.getValueType().isFloatingPoint();
6282  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
6283  if (X86CC == X86::COND_INVALID)
6284    return SDValue();
6285
6286  SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
6287
6288  // Use "sbb x, x" to materialize the carry bit into a GPR.
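  // (Illustrative sketch: this ends up as roughly "cmp a, b; sbb %eax, %eax;
  //  and $1, %eax" -- SETCC_CARRY leaves 0 or -1 depending on CF, and the AND
  //  below narrows that to 0 or 1.)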
6289  if (X86CC == X86::COND_B)
6290    return DAG.getNode(ISD::AND, dl, MVT::i8,
6291                       DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
6292                                   DAG.getConstant(X86CC, MVT::i8), Cond),
6293                       DAG.getConstant(1, MVT::i8));
6294
6295  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6296                     DAG.getConstant(X86CC, MVT::i8), Cond);
6297}
6298
6299SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
6300  SDValue Cond;
6301  SDValue Op0 = Op.getOperand(0);
6302  SDValue Op1 = Op.getOperand(1);
6303  SDValue CC = Op.getOperand(2);
6304  EVT VT = Op.getValueType();
6305  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6306  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
6307  DebugLoc dl = Op.getDebugLoc();
6308
6309  if (isFP) {
6310    unsigned SSECC = 8;
6311    EVT VT0 = Op0.getValueType();
6312    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
6313    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
6314    bool Swap = false;
6315
6316    switch (SetCCOpcode) {
6317    default: break;
6318    case ISD::SETOEQ:
6319    case ISD::SETEQ:  SSECC = 0; break;
6320    case ISD::SETOGT:
6321    case ISD::SETGT: Swap = true; // Fallthrough
6322    case ISD::SETLT:
6323    case ISD::SETOLT: SSECC = 1; break;
6324    case ISD::SETOGE:
6325    case ISD::SETGE: Swap = true; // Fallthrough
6326    case ISD::SETLE:
6327    case ISD::SETOLE: SSECC = 2; break;
6328    case ISD::SETUO:  SSECC = 3; break;
6329    case ISD::SETUNE:
6330    case ISD::SETNE:  SSECC = 4; break;
6331    case ISD::SETULE: Swap = true;
6332    case ISD::SETUGE: SSECC = 5; break;
6333    case ISD::SETULT: Swap = true;
6334    case ISD::SETUGT: SSECC = 6; break;
6335    case ISD::SETO:   SSECC = 7; break;
6336    }
6337    if (Swap)
6338      std::swap(Op0, Op1);
6339
6340    // In the two special cases we can't handle, emit two comparisons.
6341    if (SSECC == 8) {
6342      if (SetCCOpcode == ISD::SETUEQ) {
6343        SDValue UNORD, EQ;
6344        UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
6345        EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
6346        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
6347      }
6348      else if (SetCCOpcode == ISD::SETONE) {
6349        SDValue ORD, NEQ;
6350        ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
6351        NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
6352        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
6353      }
6354      llvm_unreachable("Illegal FP comparison");
6355    }
6356    // Handle all other FP comparisons here.
6357    return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
6358  }
6359
6360  // We are handling one of the integer comparisons here.  Since SSE only has
6361  // GT and EQ comparisons for integer, swapping operands and multiple
6362  // operations may be required for some comparisons.
6363  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
6364  bool Swap = false, Invert = false, FlipSigns = false;
6365
6366  switch (VT.getSimpleVT().SimpleTy) {
6367  default: break;
6368  case MVT::v8i8:
6369  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
6370  case MVT::v4i16:
6371  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
6372  case MVT::v2i32:
6373  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
6374  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
6375  }
6376
6377  switch (SetCCOpcode) {
6378  default: break;
6379  case ISD::SETNE:  Invert = true;
6380  case ISD::SETEQ:  Opc = EQOpc; break;
6381  case ISD::SETLT:  Swap = true;
6382  case ISD::SETGT:  Opc = GTOpc; break;
6383  case ISD::SETGE:  Swap = true;
6384  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
6385  case ISD::SETULT: Swap = true;
6386  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
6387  case ISD::SETUGE: Swap = true;
6388  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
6389  }
6390  if (Swap)
6391    std::swap(Op0, Op1);
6392
6393  // Since SSE has no unsigned integer comparisons, we need to flip the sign
6394  // bits of the inputs before performing those operations.
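  // For example (illustrative): an unsigned v4i32 "a >u b" becomes
  // PCMPGTD(a ^ 0x80000000, b ^ 0x80000000), since XORing the sign bits turns
  // the unsigned comparison into the equivalent signed one.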
6395  if (FlipSigns) {
6396    EVT EltVT = VT.getVectorElementType();
6397    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
6398                                      EltVT);
6399    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
6400    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
6401                                    SignBits.size());
6402    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
6403    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
6404  }
6405
6406  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
6407
6408  // If the logical-not of the result is required, perform that now.
6409  if (Invert)
6410    Result = DAG.getNOT(dl, Result, VT);
6411
6412  return Result;
6413}
6414
6415// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
6416static bool isX86LogicalCmp(SDValue Op) {
6417  unsigned Opc = Op.getNode()->getOpcode();
6418  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
6419    return true;
6420  if (Op.getResNo() == 1 &&
6421      (Opc == X86ISD::ADD ||
6422       Opc == X86ISD::SUB ||
6423       Opc == X86ISD::SMUL ||
6424       Opc == X86ISD::UMUL ||
6425       Opc == X86ISD::INC ||
6426       Opc == X86ISD::DEC ||
6427       Opc == X86ISD::OR ||
6428       Opc == X86ISD::XOR ||
6429       Opc == X86ISD::AND))
6430    return true;
6431
6432  return false;
6433}
6434
6435SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
6436  bool addTest = true;
6437  SDValue Cond  = Op.getOperand(0);
6438  DebugLoc dl = Op.getDebugLoc();
6439  SDValue CC;
6440
6441  if (Cond.getOpcode() == ISD::SETCC) {
6442    SDValue NewCond = LowerSETCC(Cond, DAG);
6443    if (NewCond.getNode())
6444      Cond = NewCond;
6445  }
6446
6447  // (select (x == 0), -1, 0) -> (sign_bit (x - 1))
6448  SDValue Op1 = Op.getOperand(1);
6449  SDValue Op2 = Op.getOperand(2);
6450  if (Cond.getOpcode() == X86ISD::SETCC &&
6451      cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
6452    SDValue Cmp = Cond.getOperand(1);
6453    if (Cmp.getOpcode() == X86ISD::CMP) {
6454      ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
6455      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
6456      ConstantSDNode *RHSC =
6457        dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
6458      if (N1C && N1C->isAllOnesValue() &&
6459          N2C && N2C->isNullValue() &&
6460          RHSC && RHSC->isNullValue()) {
6461        SDValue CmpOp0 = Cmp.getOperand(0);
6462        Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
6463                          CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
6464        return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
6465                           DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
6466      }
6467    }
6468  }
6469
6470  // Look past (and (setcc_carry (cmp ...)), 1).
6471  if (Cond.getOpcode() == ISD::AND &&
6472      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6473    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6474    if (C && C->getAPIntValue() == 1)
6475      Cond = Cond.getOperand(0);
6476  }
6477
6478  // If condition flag is set by a X86ISD::CMP, then use it as the condition
6479  // setting operand in place of the X86ISD::SETCC.
6480  if (Cond.getOpcode() == X86ISD::SETCC ||
6481      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6482    CC = Cond.getOperand(0);
6483
6484    SDValue Cmp = Cond.getOperand(1);
6485    unsigned Opc = Cmp.getOpcode();
6486    EVT VT = Op.getValueType();
6487
6488    bool IllegalFPCMov = false;
6489    if (VT.isFloatingPoint() && !VT.isVector() &&
6490        !isScalarFPTypeInSSEReg(VT))  // FPStack?
6491      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
6492
6493    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
6494        Opc == X86ISD::BT) { // FIXME
6495      Cond = Cmp;
6496      addTest = false;
6497    }
6498  }
6499
6500  if (addTest) {
6501    // Look past the truncate.
6502    if (Cond.getOpcode() == ISD::TRUNCATE)
6503      Cond = Cond.getOperand(0);
6504
6505    // We know the result of AND is compared against zero. Try to match
6506    // it to BT.
6507    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6508      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6509      if (NewSetCC.getNode()) {
6510        CC = NewSetCC.getOperand(0);
6511        Cond = NewSetCC.getOperand(1);
6512        addTest = false;
6513      }
6514    }
6515  }
6516
6517  if (addTest) {
6518    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6519    Cond = EmitTest(Cond, X86::COND_NE, DAG);
6520  }
6521
6522  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
6523  // condition is true.
6524  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
6525  SDValue Ops[] = { Op2, Op1, CC, Cond };
6526  return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
6527}
6528
6529// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
6530// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
6531// from the AND / OR.
6532static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
6533  Opc = Op.getOpcode();
6534  if (Opc != ISD::OR && Opc != ISD::AND)
6535    return false;
6536  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6537          Op.getOperand(0).hasOneUse() &&
6538          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
6539          Op.getOperand(1).hasOneUse());
6540}
6541
6542// isXor1OfSetCC - Return true if the node is an ISD::XOR of an X86ISD::SETCC
6543// and 1, and the SETCC node has a single use.
6544static bool isXor1OfSetCC(SDValue Op) {
6545  if (Op.getOpcode() != ISD::XOR)
6546    return false;
6547  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6548  if (N1C && N1C->getAPIntValue() == 1) {
6549    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6550      Op.getOperand(0).hasOneUse();
6551  }
6552  return false;
6553}
6554
6555SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
6556  bool addTest = true;
6557  SDValue Chain = Op.getOperand(0);
6558  SDValue Cond  = Op.getOperand(1);
6559  SDValue Dest  = Op.getOperand(2);
6560  DebugLoc dl = Op.getDebugLoc();
6561  SDValue CC;
6562
6563  if (Cond.getOpcode() == ISD::SETCC) {
6564    SDValue NewCond = LowerSETCC(Cond, DAG);
6565    if (NewCond.getNode())
6566      Cond = NewCond;
6567  }
6568#if 0
6569  // FIXME: LowerXALUO doesn't handle these!!
6570  else if (Cond.getOpcode() == X86ISD::ADD  ||
6571           Cond.getOpcode() == X86ISD::SUB  ||
6572           Cond.getOpcode() == X86ISD::SMUL ||
6573           Cond.getOpcode() == X86ISD::UMUL)
6574    Cond = LowerXALUO(Cond, DAG);
6575#endif
6576
6577  // Look past (and (setcc_carry (cmp ...)), 1).
6578  if (Cond.getOpcode() == ISD::AND &&
6579      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6580    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6581    if (C && C->getAPIntValue() == 1)
6582      Cond = Cond.getOperand(0);
6583  }
6584
6585  // If condition flag is set by a X86ISD::CMP, then use it as the condition
6586  // setting operand in place of the X86ISD::SETCC.
6587  if (Cond.getOpcode() == X86ISD::SETCC ||
6588      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6589    CC = Cond.getOperand(0);
6590
6591    SDValue Cmp = Cond.getOperand(1);
6592    unsigned Opc = Cmp.getOpcode();
6593    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
6594    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
6595      Cond = Cmp;
6596      addTest = false;
6597    } else {
6598      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
6599      default: break;
6600      case X86::COND_O:
6601      case X86::COND_B:
6602        // These can only come from an arithmetic instruction with overflow,
6603        // e.g. SADDO, UADDO.
6604        Cond = Cond.getNode()->getOperand(1);
6605        addTest = false;
6606        break;
6607      }
6608    }
6609  } else {
6610    unsigned CondOpc;
6611    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
6612      SDValue Cmp = Cond.getOperand(0).getOperand(1);
6613      if (CondOpc == ISD::OR) {
6614        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
6615        // two branches instead of an explicit OR instruction with a
6616        // separate test.
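        // (Illustrative, not in the original: an FCMP_UNE typically lowers to
        // "ucomiss; setne; setp; or", and here the OR of the two SETCCs is
        // replaced by a "jne dest" followed by a "jp dest" on the same
        // ucomiss result.)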
6617        if (Cmp == Cond.getOperand(1).getOperand(1) &&
6618            isX86LogicalCmp(Cmp)) {
6619          CC = Cond.getOperand(0).getOperand(0);
6620          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6621                              Chain, Dest, CC, Cmp);
6622          CC = Cond.getOperand(1).getOperand(0);
6623          Cond = Cmp;
6624          addTest = false;
6625        }
6626      } else { // ISD::AND
6627        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
6628        // two branches instead of an explicit AND instruction with a
6629        // separate test. However, we only do this if this block doesn't
6630        // have a fall-through edge, because this requires an explicit
6631        // jmp when the condition is false.
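        // (Illustrative, not in the original: an FCMP_OEQ typically lowers to
        // "ucomiss; sete; setnp; and"; after swapping the successors, the AND
        // becomes a "jne falseBB" followed by a "jp falseBB" on the same
        // ucomiss result.)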
6632        if (Cmp == Cond.getOperand(1).getOperand(1) &&
6633            isX86LogicalCmp(Cmp) &&
6634            Op.getNode()->hasOneUse()) {
6635          X86::CondCode CCode =
6636            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6637          CCode = X86::GetOppositeBranchCondition(CCode);
6638          CC = DAG.getConstant(CCode, MVT::i8);
6639          SDNode *User = *Op.getNode()->use_begin();
6640          // Look for an unconditional branch following this conditional branch.
6641          // We need this because we need to reverse the successors in order
6642          // to implement FCMP_OEQ.
6643          if (User->getOpcode() == ISD::BR) {
6644            SDValue FalseBB = User->getOperand(1);
6645            SDNode *NewBR =
6646              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
6647            assert(NewBR == User);
6648            (void)NewBR;
6649            Dest = FalseBB;
6650
6651            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6652                                Chain, Dest, CC, Cmp);
6653            X86::CondCode CCode =
6654              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
6655            CCode = X86::GetOppositeBranchCondition(CCode);
6656            CC = DAG.getConstant(CCode, MVT::i8);
6657            Cond = Cmp;
6658            addTest = false;
6659          }
6660        }
6661      }
6662    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
6663      // Recognize 'xorb (setcc), 1' patterns. The xor inverts the condition.
6664      // It should have been transformed by the DAG combiner, except when the
6665      // condition is set by an arithmetic-with-overflow node.
6666      X86::CondCode CCode =
6667        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6668      CCode = X86::GetOppositeBranchCondition(CCode);
6669      CC = DAG.getConstant(CCode, MVT::i8);
6670      Cond = Cond.getOperand(0).getOperand(1);
6671      addTest = false;
6672    }
6673  }
6674
6675  if (addTest) {
6676    // Look past the truncate.
6677    if (Cond.getOpcode() == ISD::TRUNCATE)
6678      Cond = Cond.getOperand(0);
6679
6680    // We know the result of AND is compared against zero. Try to match
6681    // it to BT.
6682    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6683      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6684      if (NewSetCC.getNode()) {
6685        CC = NewSetCC.getOperand(0);
6686        Cond = NewSetCC.getOperand(1);
6687        addTest = false;
6688      }
6689    }
6690  }
6691
6692  if (addTest) {
6693    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6694    Cond = EmitTest(Cond, X86::COND_NE, DAG);
6695  }
6696  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6697                     Chain, Dest, CC, Cond);
6698}
6699
6700
6701// Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
6702// Calls to _alloca are needed to probe the stack when allocating more than 4k
6703// bytes in one go. Touching the stack at 4K increments is necessary to ensure
6704// that the guard pages used by the OS virtual memory manager are allocated in
6705// the correct sequence.
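// (Illustrative flow of the lowering below: the requested size is copied into
// EAX, the MINGW_ALLOCA pseudo expands to a call to _alloca that probes and
// adjusts the stack, and the new stack pointer is then read back from ESP/RSP
// as the value of the dynamic allocation.)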
6706SDValue
6707X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
6708                                           SelectionDAG &DAG) const {
6709  assert(Subtarget->isTargetCygMing() &&
6710         "This should be used only on Cygwin/Mingw targets");
6711  DebugLoc dl = Op.getDebugLoc();
6712
6713  // Get the inputs.
6714  SDValue Chain = Op.getOperand(0);
6715  SDValue Size  = Op.getOperand(1);
6716  // FIXME: Ensure alignment here
6717
6718  SDValue Flag;
6719
6720  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
6721
6722  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
6723  Flag = Chain.getValue(1);
6724
6725  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
6726
6727  Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag);
6728  Flag = Chain.getValue(1);
6729
6730  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
6731
6732  SDValue Ops1[2] = { Chain.getValue(0), Chain };
6733  return DAG.getMergeValues(Ops1, 2, dl);
6734}
6735
6736SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
6737  MachineFunction &MF = DAG.getMachineFunction();
6738  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
6739
6740  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6741  DebugLoc dl = Op.getDebugLoc();
6742
6743  if (!Subtarget->is64Bit()) {
6744    // vastart just stores the address of the VarArgsFrameIndex slot into the
6745    // memory location argument.
6746    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
6747                                   getPointerTy());
6748    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
6749                        false, false, 0);
6750  }
6751
6752  // __va_list_tag:
6753  //   gp_offset         (0 - 6 * 8)
6754  //   fp_offset         (48 - 48 + 8 * 16)
6755  //   overflow_arg_area (points to parameters passed in memory).
6756  //   reg_save_area
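  // (Illustrative C view of the slots written below, assuming the SysV
  // x86-64 ABI layout:
  //   struct __va_list_tag {
  //     unsigned gp_offset;          // byte offset 0
  //     unsigned fp_offset;          // byte offset 4
  //     void    *overflow_arg_area;  // byte offset 8
  //     void    *reg_save_area;      // byte offset 16
  //   };)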
6757  SmallVector<SDValue, 8> MemOps;
6758  SDValue FIN = Op.getOperand(1);
6759  // Store gp_offset
6760  SDValue Store = DAG.getStore(Op.getOperand(0), dl,
6761                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
6762                                               MVT::i32),
6763                               FIN, SV, 0, false, false, 0);
6764  MemOps.push_back(Store);
6765
6766  // Store fp_offset
6767  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6768                    FIN, DAG.getIntPtrConstant(4));
6769  Store = DAG.getStore(Op.getOperand(0), dl,
6770                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
6771                                       MVT::i32),
6772                       FIN, SV, 4, false, false, 0);
6773  MemOps.push_back(Store);
6774
6775  // Store ptr to overflow_arg_area
6776  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6777                    FIN, DAG.getIntPtrConstant(4));
6778  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
6779                                    getPointerTy());
6780  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8,
6781                       false, false, 0);
6782  MemOps.push_back(Store);
6783
6784  // Store ptr to reg_save_area.
6785  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6786                    FIN, DAG.getIntPtrConstant(8));
6787  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
6788                                    getPointerTy());
6789  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16,
6790                       false, false, 0);
6791  MemOps.push_back(Store);
6792  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6793                     &MemOps[0], MemOps.size());
6794}
6795
6796SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
6797  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6798  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
6799
6800  report_fatal_error("VAArgInst is not yet implemented for x86-64!");
6801  return SDValue();
6802}
6803
6804SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
6805  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6806  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
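  // (Illustrative: the 24-byte memcpy below copies the whole struct,
  // i.e. 4 + 4 + 8 + 8 bytes.)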
6807  SDValue Chain = Op.getOperand(0);
6808  SDValue DstPtr = Op.getOperand(1);
6809  SDValue SrcPtr = Op.getOperand(2);
6810  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
6811  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
6812  DebugLoc dl = Op.getDebugLoc();
6813
6814  return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
6815                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
6816                       false, DstSV, 0, SrcSV, 0);
6817}
6818
6819SDValue
6820X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
6821  DebugLoc dl = Op.getDebugLoc();
6822  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6823  switch (IntNo) {
6824  default: return SDValue();    // Don't custom lower most intrinsics.
6825  // Comparison intrinsics.
6826  case Intrinsic::x86_sse_comieq_ss:
6827  case Intrinsic::x86_sse_comilt_ss:
6828  case Intrinsic::x86_sse_comile_ss:
6829  case Intrinsic::x86_sse_comigt_ss:
6830  case Intrinsic::x86_sse_comige_ss:
6831  case Intrinsic::x86_sse_comineq_ss:
6832  case Intrinsic::x86_sse_ucomieq_ss:
6833  case Intrinsic::x86_sse_ucomilt_ss:
6834  case Intrinsic::x86_sse_ucomile_ss:
6835  case Intrinsic::x86_sse_ucomigt_ss:
6836  case Intrinsic::x86_sse_ucomige_ss:
6837  case Intrinsic::x86_sse_ucomineq_ss:
6838  case Intrinsic::x86_sse2_comieq_sd:
6839  case Intrinsic::x86_sse2_comilt_sd:
6840  case Intrinsic::x86_sse2_comile_sd:
6841  case Intrinsic::x86_sse2_comigt_sd:
6842  case Intrinsic::x86_sse2_comige_sd:
6843  case Intrinsic::x86_sse2_comineq_sd:
6844  case Intrinsic::x86_sse2_ucomieq_sd:
6845  case Intrinsic::x86_sse2_ucomilt_sd:
6846  case Intrinsic::x86_sse2_ucomile_sd:
6847  case Intrinsic::x86_sse2_ucomigt_sd:
6848  case Intrinsic::x86_sse2_ucomige_sd:
6849  case Intrinsic::x86_sse2_ucomineq_sd: {
6850    unsigned Opc = 0;
6851    ISD::CondCode CC = ISD::SETCC_INVALID;
6852    switch (IntNo) {
6853    default: break;
6854    case Intrinsic::x86_sse_comieq_ss:
6855    case Intrinsic::x86_sse2_comieq_sd:
6856      Opc = X86ISD::COMI;
6857      CC = ISD::SETEQ;
6858      break;
6859    case Intrinsic::x86_sse_comilt_ss:
6860    case Intrinsic::x86_sse2_comilt_sd:
6861      Opc = X86ISD::COMI;
6862      CC = ISD::SETLT;
6863      break;
6864    case Intrinsic::x86_sse_comile_ss:
6865    case Intrinsic::x86_sse2_comile_sd:
6866      Opc = X86ISD::COMI;
6867      CC = ISD::SETLE;
6868      break;
6869    case Intrinsic::x86_sse_comigt_ss:
6870    case Intrinsic::x86_sse2_comigt_sd:
6871      Opc = X86ISD::COMI;
6872      CC = ISD::SETGT;
6873      break;
6874    case Intrinsic::x86_sse_comige_ss:
6875    case Intrinsic::x86_sse2_comige_sd:
6876      Opc = X86ISD::COMI;
6877      CC = ISD::SETGE;
6878      break;
6879    case Intrinsic::x86_sse_comineq_ss:
6880    case Intrinsic::x86_sse2_comineq_sd:
6881      Opc = X86ISD::COMI;
6882      CC = ISD::SETNE;
6883      break;
6884    case Intrinsic::x86_sse_ucomieq_ss:
6885    case Intrinsic::x86_sse2_ucomieq_sd:
6886      Opc = X86ISD::UCOMI;
6887      CC = ISD::SETEQ;
6888      break;
6889    case Intrinsic::x86_sse_ucomilt_ss:
6890    case Intrinsic::x86_sse2_ucomilt_sd:
6891      Opc = X86ISD::UCOMI;
6892      CC = ISD::SETLT;
6893      break;
6894    case Intrinsic::x86_sse_ucomile_ss:
6895    case Intrinsic::x86_sse2_ucomile_sd:
6896      Opc = X86ISD::UCOMI;
6897      CC = ISD::SETLE;
6898      break;
6899    case Intrinsic::x86_sse_ucomigt_ss:
6900    case Intrinsic::x86_sse2_ucomigt_sd:
6901      Opc = X86ISD::UCOMI;
6902      CC = ISD::SETGT;
6903      break;
6904    case Intrinsic::x86_sse_ucomige_ss:
6905    case Intrinsic::x86_sse2_ucomige_sd:
6906      Opc = X86ISD::UCOMI;
6907      CC = ISD::SETGE;
6908      break;
6909    case Intrinsic::x86_sse_ucomineq_ss:
6910    case Intrinsic::x86_sse2_ucomineq_sd:
6911      Opc = X86ISD::UCOMI;
6912      CC = ISD::SETNE;
6913      break;
6914    }
6915
6916    SDValue LHS = Op.getOperand(1);
6917    SDValue RHS = Op.getOperand(2);
6918    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
6919    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
6920    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
6921    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6922                                DAG.getConstant(X86CC, MVT::i8), Cond);
6923    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6924  }
6925  // ptest intrinsics. The intrinsics these come from are designed to return
6926  // an integer value, not just an instruction, so lower this to the ptest
6927  // pattern plus a setcc for the result.
6928  case Intrinsic::x86_sse41_ptestz:
6929  case Intrinsic::x86_sse41_ptestc:
6930  case Intrinsic::x86_sse41_ptestnzc:{
6931    unsigned X86CC = 0;
6932    switch (IntNo) {
6933    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
6934    case Intrinsic::x86_sse41_ptestz:
6935      // ZF = 1
6936      X86CC = X86::COND_E;
6937      break;
6938    case Intrinsic::x86_sse41_ptestc:
6939      // CF = 1
6940      X86CC = X86::COND_B;
6941      break;
6942    case Intrinsic::x86_sse41_ptestnzc:
6943      // ZF and CF = 0
6944      X86CC = X86::COND_A;
6945      break;
6946    }
6947
6948    SDValue LHS = Op.getOperand(1);
6949    SDValue RHS = Op.getOperand(2);
6950    SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS);
6951    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
6952    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
6953    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6954  }
6955
6956  // Fix vector shift instructions where the last operand is a non-immediate
6957  // i32 value.
6958  case Intrinsic::x86_sse2_pslli_w:
6959  case Intrinsic::x86_sse2_pslli_d:
6960  case Intrinsic::x86_sse2_pslli_q:
6961  case Intrinsic::x86_sse2_psrli_w:
6962  case Intrinsic::x86_sse2_psrli_d:
6963  case Intrinsic::x86_sse2_psrli_q:
6964  case Intrinsic::x86_sse2_psrai_w:
6965  case Intrinsic::x86_sse2_psrai_d:
6966  case Intrinsic::x86_mmx_pslli_w:
6967  case Intrinsic::x86_mmx_pslli_d:
6968  case Intrinsic::x86_mmx_pslli_q:
6969  case Intrinsic::x86_mmx_psrli_w:
6970  case Intrinsic::x86_mmx_psrli_d:
6971  case Intrinsic::x86_mmx_psrli_q:
6972  case Intrinsic::x86_mmx_psrai_w:
6973  case Intrinsic::x86_mmx_psrai_d: {
6974    SDValue ShAmt = Op.getOperand(2);
6975    if (isa<ConstantSDNode>(ShAmt))
6976      return SDValue();
6977
6978    unsigned NewIntNo = 0;
6979    EVT ShAmtVT = MVT::v4i32;
6980    switch (IntNo) {
6981    case Intrinsic::x86_sse2_pslli_w:
6982      NewIntNo = Intrinsic::x86_sse2_psll_w;
6983      break;
6984    case Intrinsic::x86_sse2_pslli_d:
6985      NewIntNo = Intrinsic::x86_sse2_psll_d;
6986      break;
6987    case Intrinsic::x86_sse2_pslli_q:
6988      NewIntNo = Intrinsic::x86_sse2_psll_q;
6989      break;
6990    case Intrinsic::x86_sse2_psrli_w:
6991      NewIntNo = Intrinsic::x86_sse2_psrl_w;
6992      break;
6993    case Intrinsic::x86_sse2_psrli_d:
6994      NewIntNo = Intrinsic::x86_sse2_psrl_d;
6995      break;
6996    case Intrinsic::x86_sse2_psrli_q:
6997      NewIntNo = Intrinsic::x86_sse2_psrl_q;
6998      break;
6999    case Intrinsic::x86_sse2_psrai_w:
7000      NewIntNo = Intrinsic::x86_sse2_psra_w;
7001      break;
7002    case Intrinsic::x86_sse2_psrai_d:
7003      NewIntNo = Intrinsic::x86_sse2_psra_d;
7004      break;
7005    default: {
7006      ShAmtVT = MVT::v2i32;
7007      switch (IntNo) {
7008      case Intrinsic::x86_mmx_pslli_w:
7009        NewIntNo = Intrinsic::x86_mmx_psll_w;
7010        break;
7011      case Intrinsic::x86_mmx_pslli_d:
7012        NewIntNo = Intrinsic::x86_mmx_psll_d;
7013        break;
7014      case Intrinsic::x86_mmx_pslli_q:
7015        NewIntNo = Intrinsic::x86_mmx_psll_q;
7016        break;
7017      case Intrinsic::x86_mmx_psrli_w:
7018        NewIntNo = Intrinsic::x86_mmx_psrl_w;
7019        break;
7020      case Intrinsic::x86_mmx_psrli_d:
7021        NewIntNo = Intrinsic::x86_mmx_psrl_d;
7022        break;
7023      case Intrinsic::x86_mmx_psrli_q:
7024        NewIntNo = Intrinsic::x86_mmx_psrl_q;
7025        break;
7026      case Intrinsic::x86_mmx_psrai_w:
7027        NewIntNo = Intrinsic::x86_mmx_psra_w;
7028        break;
7029      case Intrinsic::x86_mmx_psrai_d:
7030        NewIntNo = Intrinsic::x86_mmx_psra_d;
7031        break;
7032      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
7033      }
7034      break;
7035    }
7036    }
7037
7038    // The vector shift intrinsics with scalar shift amounts use 32-bit
7039    // values, but the SSE2/MMX shift instructions read 64 bits. Set the
7040    // upper 32 bits to zero.
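    // (Illustrative, assumed example: _mm_slli_epi32(v, n) with a
    // non-constant n reaches this code as x86_sse2_pslli_d and is rewritten
    // to the x86_sse2_psll_d form, with n placed in element 0 of a vector
    // whose second 32-bit element is zero.)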
7041    SDValue ShOps[4];
7042    ShOps[0] = ShAmt;
7043    ShOps[1] = DAG.getConstant(0, MVT::i32);
7044    if (ShAmtVT == MVT::v4i32) {
7045      ShOps[2] = DAG.getUNDEF(MVT::i32);
7046      ShOps[3] = DAG.getUNDEF(MVT::i32);
7047      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
7048    } else {
7049      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
7050    }
7051
7052    EVT VT = Op.getValueType();
7053    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
7054    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7055                       DAG.getConstant(NewIntNo, MVT::i32),
7056                       Op.getOperand(1), ShAmt);
7057  }
7058  }
7059}
7060
7061SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
7062                                           SelectionDAG &DAG) const {
7063  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7064  MFI->setReturnAddressIsTaken(true);
7065
7066  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7067  DebugLoc dl = Op.getDebugLoc();
7068
7069  if (Depth > 0) {
7070    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
7071    SDValue Offset =
7072      DAG.getConstant(TD->getPointerSize(),
7073                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
7074    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
7075                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
7076                                   FrameAddr, Offset),
7077                       NULL, 0, false, false, 0);
7078  }
7079
7080  // Just load the return address.
7081  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
7082  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
7083                     RetAddrFI, NULL, 0, false, false, 0);
7084}
7085
7086SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
7087  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7088  MFI->setFrameAddressIsTaken(true);
7089
7090  EVT VT = Op.getValueType();
7091  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
7092  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7093  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
7094  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
7095  while (Depth--)
7096    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
7097                            false, false, 0);
7098  return FrameAddr;
7099}
7100
7101SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
7102                                                     SelectionDAG &DAG) const {
7103  return DAG.getIntPtrConstant(2*TD->getPointerSize());
7104}
7105
7106SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
7107  MachineFunction &MF = DAG.getMachineFunction();
7108  SDValue Chain     = Op.getOperand(0);
7109  SDValue Offset    = Op.getOperand(1);
7110  SDValue Handler   = Op.getOperand(2);
7111  DebugLoc dl       = Op.getDebugLoc();
7112
7113  SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
7114                                  getPointerTy());
7115  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
7116
7117  SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
7118                                  DAG.getIntPtrConstant(-TD->getPointerSize()));
7119  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
7120  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0);
7121  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
7122  MF.getRegInfo().addLiveOut(StoreAddrReg);
7123
7124  return DAG.getNode(X86ISD::EH_RETURN, dl,
7125                     MVT::Other,
7126                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
7127}
7128
7129SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
7130                                             SelectionDAG &DAG) const {
7131  SDValue Root = Op.getOperand(0);
7132  SDValue Trmp = Op.getOperand(1); // trampoline
7133  SDValue FPtr = Op.getOperand(2); // nested function
7134  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7135  DebugLoc dl  = Op.getDebugLoc();
7136
7137  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7138
7139  if (Subtarget->is64Bit()) {
7140    SDValue OutChains[6];
7141
7142    // Large code-model.
7143    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
7144    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
7145
7146    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
7147    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
7148
7149    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
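    // (Illustrative byte layout of the trampoline built below, assuming the
    // store offsets used and little-endian i16 opcode stores:
    //   [ 0] 49 BB <imm64 FPtr>   movabsq $FPtr, %r11
    //   [10] 49 BA <imm64 Nest>   movabsq $Nest, %r10
    //   [20] 49 FF E3             jmpq   *%r11 )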
7150
7151    // Load the pointer to the nested function into R11.
7152    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
7153    SDValue Addr = Trmp;
7154    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7155                                Addr, TrmpAddr, 0, false, false, 0);
7156
7157    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7158                       DAG.getConstant(2, MVT::i64));
7159    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2,
7160                                false, false, 2);
7161
7162    // Load the 'nest' parameter value into R10.
7163    // R10 is specified in X86CallingConv.td
7164    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
7165    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7166                       DAG.getConstant(10, MVT::i64));
7167    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7168                                Addr, TrmpAddr, 10, false, false, 0);
7169
7170    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7171                       DAG.getConstant(12, MVT::i64));
7172    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12,
7173                                false, false, 2);
7174
7175    // Jump to the nested function.
7176    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
7177    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7178                       DAG.getConstant(20, MVT::i64));
7179    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7180                                Addr, TrmpAddr, 20, false, false, 0);
7181
7182    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
7183    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7184                       DAG.getConstant(22, MVT::i64));
7185    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
7186                                TrmpAddr, 22, false, false, 0);
7187
7188    SDValue Ops[] =
7189      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
7190    return DAG.getMergeValues(Ops, 2, dl);
7191  } else {
7192    const Function *Func =
7193      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7194    CallingConv::ID CC = Func->getCallingConv();
7195    unsigned NestReg;
7196
7197    switch (CC) {
7198    default:
7199      llvm_unreachable("Unsupported calling convention");
7200    case CallingConv::C:
7201    case CallingConv::X86_StdCall: {
7202      // Pass 'nest' parameter in ECX.
7203      // Must be kept in sync with X86CallingConv.td
7204      NestReg = X86::ECX;
7205
7206      // Check that ECX wasn't needed by an 'inreg' parameter.
7207      const FunctionType *FTy = Func->getFunctionType();
7208      const AttrListPtr &Attrs = Func->getAttributes();
7209
7210      if (!Attrs.isEmpty() && !Func->isVarArg()) {
7211        unsigned InRegCount = 0;
7212        unsigned Idx = 1;
7213
7214        for (FunctionType::param_iterator I = FTy->param_begin(),
7215             E = FTy->param_end(); I != E; ++I, ++Idx)
7216          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
7217            // FIXME: should only count parameters that are lowered to integers.
7218            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
7219
7220        if (InRegCount > 2) {
7221          report_fatal_error("Nest register in use - reduce number of inreg"
7222                             " parameters!");
7223        }
7224      }
7225      break;
7226    }
7227    case CallingConv::X86_FastCall:
7228    case CallingConv::X86_ThisCall:
7229    case CallingConv::Fast:
7230      // Pass 'nest' parameter in EAX.
7231      // Must be kept in sync with X86CallingConv.td
7232      NestReg = X86::EAX;
7233      break;
7234    }
7235
7236    SDValue OutChains[4];
7237    SDValue Addr, Disp;
7238
7239    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7240                       DAG.getConstant(10, MVT::i32));
7241    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
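    // (Illustrative: Disp is the rel32 operand of the "jmp" stored at offset
    // 5 below, i.e. the target minus the address of the next instruction,
    // which is Trmp + 10.)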
7242
7243    // This is storing the opcode for MOV32ri.
7244    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
7245    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
7246    OutChains[0] = DAG.getStore(Root, dl,
7247                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
7248                                Trmp, TrmpAddr, 0, false, false, 0);
7249
7250    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7251                       DAG.getConstant(1, MVT::i32));
7252    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1,
7253                                false, false, 1);
7254
7255    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
7256    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7257                       DAG.getConstant(5, MVT::i32));
7258    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
7259                                TrmpAddr, 5, false, false, 1);
7260
7261    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7262                       DAG.getConstant(6, MVT::i32));
7263    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6,
7264                                false, false, 1);
7265
7266    SDValue Ops[] =
7267      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
7268    return DAG.getMergeValues(Ops, 2, dl);
7269  }
7270}
7271
7272SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
7273                                            SelectionDAG &DAG) const {
7274  /*
7275   The rounding mode is in bits 11:10 of the FP control word (FPCW), and has the following
7276   settings:
7277     00 Round to nearest
7278     01 Round to -inf
7279     10 Round to +inf
7280     11 Round to 0
7281
7282  FLT_ROUNDS, on the other hand, expects the following:
7283    -1 Undefined
7284     0 Round to 0
7285     1 Round to nearest
7286     2 Round to +inf
7287     3 Round to -inf
7288
7289  To perform the conversion, we do:
7290    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
7291  */
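  // (Illustrative check of the formula above, assuming RC = 10, i.e. round
  // to +inf: bit 11 is set and bit 10 is clear, so the expression yields
  // ((1 | 0) + 1) & 3 = 2, which is FLT_ROUNDS' "round to +inf".)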
7292
7293  MachineFunction &MF = DAG.getMachineFunction();
7294  const TargetMachine &TM = MF.getTarget();
7295  const TargetFrameInfo &TFI = *TM.getFrameInfo();
7296  unsigned StackAlignment = TFI.getStackAlignment();
7297  EVT VT = Op.getValueType();
7298  DebugLoc dl = Op.getDebugLoc();
7299
7300  // Save FP Control Word to stack slot
7301  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
7302  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7303
7304  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
7305                              DAG.getEntryNode(), StackSlot);
7306
7307  // Load FP Control Word from stack slot
7308  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0,
7309                            false, false, 0);
7310
7311  // Transform as necessary
7312  SDValue CWD1 =
7313    DAG.getNode(ISD::SRL, dl, MVT::i16,
7314                DAG.getNode(ISD::AND, dl, MVT::i16,
7315                            CWD, DAG.getConstant(0x800, MVT::i16)),
7316                DAG.getConstant(11, MVT::i8));
7317  SDValue CWD2 =
7318    DAG.getNode(ISD::SRL, dl, MVT::i16,
7319                DAG.getNode(ISD::AND, dl, MVT::i16,
7320                            CWD, DAG.getConstant(0x400, MVT::i16)),
7321                DAG.getConstant(9, MVT::i8));
7322
7323  SDValue RetVal =
7324    DAG.getNode(ISD::AND, dl, MVT::i16,
7325                DAG.getNode(ISD::ADD, dl, MVT::i16,
7326                            DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
7327                            DAG.getConstant(1, MVT::i16)),
7328                DAG.getConstant(3, MVT::i16));
7329
7330
7331  return DAG.getNode((VT.getSizeInBits() < 16 ?
7332                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
7333}
7334
7335SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
7336  EVT VT = Op.getValueType();
7337  EVT OpVT = VT;
7338  unsigned NumBits = VT.getSizeInBits();
7339  DebugLoc dl = Op.getDebugLoc();
7340
7341  Op = Op.getOperand(0);
7342  if (VT == MVT::i8) {
7343    // Zero extend to i32 since there is not an i8 bsr.
7344    OpVT = MVT::i32;
7345    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
7346  }
7347
7348  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
7349  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
7350  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
7351
7352  // If src is zero (i.e. bsr sets ZF), returns NumBits.
7353  SDValue Ops[] = {
7354    Op,
7355    DAG.getConstant(NumBits+NumBits-1, OpVT),
7356    DAG.getConstant(X86::COND_E, MVT::i8),
7357    Op.getValue(1)
7358  };
7359  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
7360
7361  // Finally xor with NumBits-1.
7362  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
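  // (Illustrative, not in the original: for a 32-bit input of 0x10, bsr
  // yields 4 and 4 ^ 31 = 27, which is ctlz(0x10); for a zero input the
  // CMOV above picks 63 and 63 ^ 31 = 32 = NumBits.)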
7363
7364  if (VT == MVT::i8)
7365    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
7366  return Op;
7367}
7368
7369SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
7370  EVT VT = Op.getValueType();
7371  EVT OpVT = VT;
7372  unsigned NumBits = VT.getSizeInBits();
7373  DebugLoc dl = Op.getDebugLoc();
7374
7375  Op = Op.getOperand(0);
7376  if (VT == MVT::i8) {
7377    OpVT = MVT::i32;
7378    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
7379  }
7380
7381  // Issue a bsf (scan bits forward) which also sets EFLAGS.
7382  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
7383  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
7384
7385  // If src is zero (i.e. bsf sets ZF), returns NumBits.
7386  SDValue Ops[] = {
7387    Op,
7388    DAG.getConstant(NumBits, OpVT),
7389    DAG.getConstant(X86::COND_E, MVT::i8),
7390    Op.getValue(1)
7391  };
7392  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
7393
7394  if (VT == MVT::i8)
7395    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
7396  return Op;
7397}
7398
7399SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
7400  EVT VT = Op.getValueType();
7401  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
7402  DebugLoc dl = Op.getDebugLoc();
7403
7404  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
7405  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
7406  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
7407  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
7408  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
7409  //
7410  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
7411  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
7412  //  return AloBlo + AloBhi + AhiBlo;
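  // (Illustrative derivation, not in the original: writing a = Alo + 2^32*Ahi
  // and b = Blo + 2^32*Bhi, the product modulo 2^64 is
  //   a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo)
  // since the Ahi*Bhi term is shifted out, which is exactly the
  // pmuludq/psllq combination above.)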
7413
7414  SDValue A = Op.getOperand(0);
7415  SDValue B = Op.getOperand(1);
7416
7417  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7418                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7419                       A, DAG.getConstant(32, MVT::i32));
7420  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7421                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7422                       B, DAG.getConstant(32, MVT::i32));
7423  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7424                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7425                       A, B);
7426  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7427                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7428                       A, Bhi);
7429  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7430                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7431                       Ahi, B);
7432  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7433                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7434                       AloBhi, DAG.getConstant(32, MVT::i32));
7435  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7436                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7437                       AhiBlo, DAG.getConstant(32, MVT::i32));
7438  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
7439  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
7440  return Res;
7441}
7442
7443
7444SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
7445  // Lower the "add/sub/mul with overflow" instruction into a regular instruction
7446  // plus a "setcc" instruction that checks the overflow flag. The "brcond"
7447  // lowering looks for this combo and may remove the "setcc" instruction if the
7448  // "setcc" has only one use.
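  // (Illustrative, assumed example: @llvm.sadd.with.overflow.i32 becomes an
  // X86ISD::ADD that also produces EFLAGS, followed by a SETCC on
  // X86::COND_O; when the only user is a branch, LowerBRCOND folds the SETCC
  // away and branches on EFLAGS directly.)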
7449  SDNode *N = Op.getNode();
7450  SDValue LHS = N->getOperand(0);
7451  SDValue RHS = N->getOperand(1);
7452  unsigned BaseOp = 0;
7453  unsigned Cond = 0;
7454  DebugLoc dl = Op.getDebugLoc();
7455
7456  switch (Op.getOpcode()) {
7457  default: llvm_unreachable("Unknown ovf instruction!");
7458  case ISD::SADDO:
7459    // An add of one will be selected as an INC. Note that INC doesn't
7460    // set CF, so we can't do this for UADDO.
7461    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
7462      if (C->getAPIntValue() == 1) {
7463        BaseOp = X86ISD::INC;
7464        Cond = X86::COND_O;
7465        break;
7466      }
7467    BaseOp = X86ISD::ADD;
7468    Cond = X86::COND_O;
7469    break;
7470  case ISD::UADDO:
7471    BaseOp = X86ISD::ADD;
7472    Cond = X86::COND_B;
7473    break;
7474  case ISD::SSUBO:
7475    // A subtract of one will be selected as a DEC. Note that DEC doesn't
7476    // set CF, so we can't do this for USUBO.
7477    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
7478      if (C->getAPIntValue() == 1) {
7479        BaseOp = X86ISD::DEC;
7480        Cond = X86::COND_O;
7481        break;
7482      }
7483    BaseOp = X86ISD::SUB;
7484    Cond = X86::COND_O;
7485    break;
7486  case ISD::USUBO:
7487    BaseOp = X86ISD::SUB;
7488    Cond = X86::COND_B;
7489    break;
7490  case ISD::SMULO:
7491    BaseOp = X86ISD::SMUL;
7492    Cond = X86::COND_O;
7493    break;
7494  case ISD::UMULO:
7495    BaseOp = X86ISD::UMUL;
7496    Cond = X86::COND_B;
7497    break;
7498  }
7499
7500  // Also sets EFLAGS.
7501  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
7502  SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
7503
7504  SDValue SetCC =
7505    DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
7506                DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
7507
7508  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
7509  return Sum;
7510}
7511
7512SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
7513  EVT T = Op.getValueType();
7514  DebugLoc dl = Op.getDebugLoc();
7515  unsigned Reg = 0;
7516  unsigned size = 0;
7517  switch(T.getSimpleVT().SimpleTy) {
7518  default:
7519    assert(false && "Invalid value type!");
7520  case MVT::i8:  Reg = X86::AL;  size = 1; break;
7521  case MVT::i16: Reg = X86::AX;  size = 2; break;
7522  case MVT::i32: Reg = X86::EAX; size = 4; break;
7523  case MVT::i64:
7524    assert(Subtarget->is64Bit() && "Node not type legal!");
7525    Reg = X86::RAX; size = 8;
7526    break;
7527  }
7528  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
7529                                    Op.getOperand(2), SDValue());
7530  SDValue Ops[] = { cpIn.getValue(0),
7531                    Op.getOperand(1),
7532                    Op.getOperand(3),
7533                    DAG.getTargetConstant(size, MVT::i8),
7534                    cpIn.getValue(1) };
7535  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7536  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
7537  SDValue cpOut =
7538    DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
7539  return cpOut;
7540}
7541
7542SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
7543                                                 SelectionDAG &DAG) const {
7544  assert(Subtarget->is64Bit() && "Result not type legalized?");
7545  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7546  SDValue TheChain = Op.getOperand(0);
7547  DebugLoc dl = Op.getDebugLoc();
7548  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7549  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
7550  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
7551                                   rax.getValue(2));
7552  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
7553                            DAG.getConstant(32, MVT::i8));
7554  SDValue Ops[] = {
7555    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
7556    rdx.getValue(1)
7557  };
7558  return DAG.getMergeValues(Ops, 2, dl);
7559}
7560
7561SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op,
7562                                            SelectionDAG &DAG) const {
7563  EVT SrcVT = Op.getOperand(0).getValueType();
7564  EVT DstVT = Op.getValueType();
7565  assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
7566          Subtarget->hasMMX() && !DisableMMX) &&
7567         "Unexpected custom BIT_CONVERT");
7568  assert((DstVT == MVT::i64 ||
7569          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
7570         "Unexpected custom BIT_CONVERT");
7571  // i64 <=> MMX conversions are Legal.
7572  if (SrcVT==MVT::i64 && DstVT.isVector())
7573    return Op;
7574  if (DstVT==MVT::i64 && SrcVT.isVector())
7575    return Op;
7576  // MMX <=> MMX conversions are Legal.
7577  if (SrcVT.isVector() && DstVT.isVector())
7578    return Op;
7579  // All other conversions need to be expanded.
7580  return SDValue();
7581}
7582SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
7583  SDNode *Node = Op.getNode();
7584  DebugLoc dl = Node->getDebugLoc();
7585  EVT T = Node->getValueType(0);
7586  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
7587                              DAG.getConstant(0, T), Node->getOperand(2));
7588  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
7589                       cast<AtomicSDNode>(Node)->getMemoryVT(),
7590                       Node->getOperand(0),
7591                       Node->getOperand(1), negOp,
7592                       cast<AtomicSDNode>(Node)->getSrcValue(),
7593                       cast<AtomicSDNode>(Node)->getAlignment());
7594}
7595
7596/// LowerOperation - Provide custom lowering hooks for some operations.
7597///
7598SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
7599  switch (Op.getOpcode()) {
7600  default: llvm_unreachable("Should not custom lower this!");
7601  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
7602  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
7603  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
7604  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
7605  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
7606  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7607  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
7608  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
7609  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
7610  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
7611  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
7612  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
7613  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
7614  case ISD::SHL_PARTS:
7615  case ISD::SRA_PARTS:
7616  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
7617  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
7618  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
7619  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
7620  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
7621  case ISD::FABS:               return LowerFABS(Op, DAG);
7622  case ISD::FNEG:               return LowerFNEG(Op, DAG);
7623  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
7624  case ISD::SETCC:              return LowerSETCC(Op, DAG);
7625  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
7626  case ISD::SELECT:             return LowerSELECT(Op, DAG);
7627  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
7628  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
7629  case ISD::VASTART:            return LowerVASTART(Op, DAG);
7630  case ISD::VAARG:              return LowerVAARG(Op, DAG);
7631  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
7632  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7633  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
7634  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
7635  case ISD::FRAME_TO_ARGS_OFFSET:
7636                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
7637  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
7638  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
7639  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
7640  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
7641  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
7642  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
7643  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
7644  case ISD::SADDO:
7645  case ISD::UADDO:
7646  case ISD::SSUBO:
7647  case ISD::USUBO:
7648  case ISD::SMULO:
7649  case ISD::UMULO:              return LowerXALUO(Op, DAG);
7650  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
7651  case ISD::BIT_CONVERT:        return LowerBIT_CONVERT(Op, DAG);
7652  }
7653}
7654
7655void X86TargetLowering::
7656ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
7657                        SelectionDAG &DAG, unsigned NewOp) const {
7658  EVT T = Node->getValueType(0);
7659  DebugLoc dl = Node->getDebugLoc();
7660  assert (T == MVT::i64 && "Only know how to expand i64 atomics");
7661
7662  SDValue Chain = Node->getOperand(0);
7663  SDValue In1 = Node->getOperand(1);
7664  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7665                             Node->getOperand(2), DAG.getIntPtrConstant(0));
7666  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7667                             Node->getOperand(2), DAG.getIntPtrConstant(1));
7668  SDValue Ops[] = { Chain, In1, In2L, In2H };
7669  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
7670  SDValue Result =
7671    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
7672                            cast<MemSDNode>(Node)->getMemOperand());
7673  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
7674  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
7675  Results.push_back(Result.getValue(2));
7676}
7677
7678/// ReplaceNodeResults - Replace a node with an illegal result type
7679/// with a new node built out of custom code.
7680void X86TargetLowering::ReplaceNodeResults(SDNode *N,
7681                                           SmallVectorImpl<SDValue>&Results,
7682                                           SelectionDAG &DAG) const {
7683  DebugLoc dl = N->getDebugLoc();
7684  switch (N->getOpcode()) {
7685  default:
7686    assert(false && "Do not know how to custom type legalize this operation!");
7687    return;
7688  case ISD::FP_TO_SINT: {
7689    std::pair<SDValue,SDValue> Vals =
7690        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
7691    SDValue FIST = Vals.first, StackSlot = Vals.second;
7692    if (FIST.getNode() != 0) {
7693      EVT VT = N->getValueType(0);
7694      // Return a load from the stack slot.
7695      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
7696                                    false, false, 0));
7697    }
7698    return;
7699  }
7700  case ISD::READCYCLECOUNTER: {
7701    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7702    SDValue TheChain = N->getOperand(0);
7703    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7704    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
7705                                     rd.getValue(1));
7706    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
7707                                     eax.getValue(2));
7708    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
7709    SDValue Ops[] = { eax, edx };
7710    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
7711    Results.push_back(edx.getValue(1));
7712    return;
7713  }
7714  case ISD::ATOMIC_CMP_SWAP: {
7715    EVT T = N->getValueType(0);
7716    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
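    // (Illustrative summary of the expansion below: the expected value is
    // placed in EDX:EAX and the new value in ECX:EBX, the LCMPXCHG8_DAG
    // pseudo becomes cmpxchg8b, and the 64-bit result is rebuilt from
    // EDX:EAX with BUILD_PAIR.)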
7717    SDValue cpInL, cpInH;
7718    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
7719                        DAG.getConstant(0, MVT::i32));
7720    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
7721                        DAG.getConstant(1, MVT::i32));
7722    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
7723    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
7724                             cpInL.getValue(1));
7725    SDValue swapInL, swapInH;
7726    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
7727                          DAG.getConstant(0, MVT::i32));
7728    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
7729                          DAG.getConstant(1, MVT::i32));
7730    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
7731                               cpInH.getValue(1));
7732    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
7733                               swapInL.getValue(1));
7734    SDValue Ops[] = { swapInH.getValue(0),
7735                      N->getOperand(1),
7736                      swapInH.getValue(1) };
7737    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7738    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
7739    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
7740                                        MVT::i32, Result.getValue(1));
7741    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
7742                                        MVT::i32, cpOutL.getValue(2));
7743    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
7744    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
7745    Results.push_back(cpOutH.getValue(1));
7746    return;
7747  }
7748  case ISD::ATOMIC_LOAD_ADD:
7749    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
7750    return;
7751  case ISD::ATOMIC_LOAD_AND:
7752    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
7753    return;
7754  case ISD::ATOMIC_LOAD_NAND:
7755    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
7756    return;
7757  case ISD::ATOMIC_LOAD_OR:
7758    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
7759    return;
7760  case ISD::ATOMIC_LOAD_SUB:
7761    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
7762    return;
7763  case ISD::ATOMIC_LOAD_XOR:
7764    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
7765    return;
7766  case ISD::ATOMIC_SWAP:
7767    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
7768    return;
7769  }
7770}
7771
7772const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
7773  switch (Opcode) {
7774  default: return NULL;
7775  case X86ISD::BSF:                return "X86ISD::BSF";
7776  case X86ISD::BSR:                return "X86ISD::BSR";
7777  case X86ISD::SHLD:               return "X86ISD::SHLD";
7778  case X86ISD::SHRD:               return "X86ISD::SHRD";
7779  case X86ISD::FAND:               return "X86ISD::FAND";
7780  case X86ISD::FOR:                return "X86ISD::FOR";
7781  case X86ISD::FXOR:               return "X86ISD::FXOR";
7782  case X86ISD::FSRL:               return "X86ISD::FSRL";
7783  case X86ISD::FILD:               return "X86ISD::FILD";
7784  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
7785  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
7786  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
7787  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
7788  case X86ISD::FLD:                return "X86ISD::FLD";
7789  case X86ISD::FST:                return "X86ISD::FST";
7790  case X86ISD::CALL:               return "X86ISD::CALL";
7791  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
7792  case X86ISD::BT:                 return "X86ISD::BT";
7793  case X86ISD::CMP:                return "X86ISD::CMP";
7794  case X86ISD::COMI:               return "X86ISD::COMI";
7795  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
7796  case X86ISD::SETCC:              return "X86ISD::SETCC";
7797  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
7798  case X86ISD::CMOV:               return "X86ISD::CMOV";
7799  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
7800  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
7801  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
7802  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
7803  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
7804  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
7805  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
7806  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
7807  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
7808  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
7809  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
7810  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
7811  case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
7812  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
7813  case X86ISD::FMAX:               return "X86ISD::FMAX";
7814  case X86ISD::FMIN:               return "X86ISD::FMIN";
7815  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
7816  case X86ISD::FRCP:               return "X86ISD::FRCP";
7817  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
7818  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
7819  case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
7820  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
7821  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
7822  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
7823  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
7824  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
7825  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
7826  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
7827  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
7828  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
7829  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
7830  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
7831  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
7832  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
7833  case X86ISD::VSHL:               return "X86ISD::VSHL";
7834  case X86ISD::VSRL:               return "X86ISD::VSRL";
7835  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
7836  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
7837  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
7838  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
7839  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
7840  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
7841  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
7842  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
7843  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
7844  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
7845  case X86ISD::ADD:                return "X86ISD::ADD";
7846  case X86ISD::SUB:                return "X86ISD::SUB";
7847  case X86ISD::SMUL:               return "X86ISD::SMUL";
7848  case X86ISD::UMUL:               return "X86ISD::UMUL";
7849  case X86ISD::INC:                return "X86ISD::INC";
7850  case X86ISD::DEC:                return "X86ISD::DEC";
7851  case X86ISD::OR:                 return "X86ISD::OR";
7852  case X86ISD::XOR:                return "X86ISD::XOR";
7853  case X86ISD::AND:                return "X86ISD::AND";
7854  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
7855  case X86ISD::PTEST:              return "X86ISD::PTEST";
7856  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
7857  case X86ISD::MINGW_ALLOCA:       return "X86ISD::MINGW_ALLOCA";
7858  }
7859}
7860
7861/// isLegalAddressingMode - Return true if the addressing mode represented
7862/// by AM is legal for this target, for a load/store of the specified type.
7863bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
7864                                              const Type *Ty) const {
7865  // X86 supports extremely general addressing modes.
7866  CodeModel::Model M = getTargetMachine().getCodeModel();
7867
7868  // X86 allows a sign-extended 32-bit immediate field as a displacement.
7869  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
7870    return false;
7871
7872  if (AM.BaseGV) {
7873    unsigned GVFlags =
7874      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
7875
7876    // If a reference to this global requires an extra load, we can't fold it.
7877    if (isGlobalStubReference(GVFlags))
7878      return false;
7879
7880    // If BaseGV requires a register for the PIC base, we cannot also have a
7881    // BaseReg specified.
7882    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
7883      return false;
7884
7885    // If lower 4G is not available, then we must use rip-relative addressing.
7886    if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
7887      return false;
7888  }
7889
7890  switch (AM.Scale) {
7891  case 0:
7892  case 1:
7893  case 2:
7894  case 4:
7895  case 8:
7896    // These scales always work.
7897    break;
7898  case 3:
7899  case 5:
7900  case 9:
7901    // These scales are formed with basereg+scalereg.  Only accept if there is
7902    // no basereg yet.
7903    if (AM.HasBaseReg)
7904      return false;
7905    break;
7906  default:  // Other stuff never works.
7907    return false;
7908  }
7909
7910  return true;
7911}
7912
7913
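/// isTruncateFree - Truncating an integer to a narrower integer type is
/// effectively free on x86: it only requires referring to the narrower
/// (sub)register, so no instruction needs to be emitted.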
7914bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
7915  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
7916    return false;
7917  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
7918  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
7919  if (NumBits1 <= NumBits2)
7920    return false;
7921  return true;
7922}
7923
7924bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
7925  if (!VT1.isInteger() || !VT2.isInteger())
7926    return false;
7927  unsigned NumBits1 = VT1.getSizeInBits();
7928  unsigned NumBits2 = VT2.getSizeInBits();
7929  if (NumBits1 <= NumBits2)
7930    return false;
7931  return true;
7932}
7933
7934bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
7935  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7936  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
7937}
7938
7939bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
7940  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7941  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
7942}
7943
7944bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
7945  // i16 instructions are longer (0x66 prefix) and potentially slower.
7946  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
7947}
7948
7949/// isShuffleMaskLegal - Targets can use this to indicate that they only
7950/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
7951/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
7952/// are assumed to be legal.
7953bool
7954X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
7955                                      EVT VT) const {
7956  // Very little shuffling can be done for 64-bit vectors right now.
7957  if (VT.getSizeInBits() == 64)
7958    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());
7959
7960  // FIXME: pshufb, blends, shifts.
7961  return (VT.getVectorNumElements() == 2 ||
7962          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7963          isMOVLMask(M, VT) ||
7964          isSHUFPMask(M, VT) ||
7965          isPSHUFDMask(M, VT) ||
7966          isPSHUFHWMask(M, VT) ||
7967          isPSHUFLWMask(M, VT) ||
7968          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
7969          isUNPCKLMask(M, VT) ||
7970          isUNPCKHMask(M, VT) ||
7971          isUNPCKL_v_undef_Mask(M, VT) ||
7972          isUNPCKH_v_undef_Mask(M, VT));
7973}
7974
7975bool
7976X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
7977                                          EVT VT) const {
7978  unsigned NumElts = VT.getVectorNumElements();
7979  // FIXME: This collection of masks seems suspect.
7980  if (NumElts == 2)
7981    return true;
7982  if (NumElts == 4 && VT.getSizeInBits() == 128) {
7983    return (isMOVLMask(Mask, VT)  ||
7984            isCommutedMOVLMask(Mask, VT, true) ||
7985            isSHUFPMask(Mask, VT) ||
7986            isCommutedSHUFPMask(Mask, VT));
7987  }
7988  return false;
7989}
7990
7991//===----------------------------------------------------------------------===//
7992//                           X86 Scheduler Hooks
7993//===----------------------------------------------------------------------===//
7994
7995// private utility function
7996MachineBasicBlock *
7997X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
7998                                                       MachineBasicBlock *MBB,
7999                                                       unsigned regOpc,
8000                                                       unsigned immOpc,
8001                                                       unsigned LoadOpc,
8002                                                       unsigned CXchgOpc,
8003                                                       unsigned notOpc,
8004                                                       unsigned EAXreg,
8005                                                       TargetRegisterClass *RC,
8006                                                       bool invSrc) const {
8007  // For the atomic bitwise operator, we generate
8008  //   thisMBB:
8009  //   newMBB:
8010  //     ld  t1 = [bitinstr.addr]
8011  //     op  t2 = t1, [bitinstr.val]
8012  //     mov EAX = t1
8013  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
8014  //     bz  newMBB
8015  //     fallthrough -->nextMBB
8016  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8017  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8018  MachineFunction::iterator MBBIter = MBB;
8019  ++MBBIter;
8020
8021  // First build the CFG
8022  MachineFunction *F = MBB->getParent();
8023  MachineBasicBlock *thisMBB = MBB;
8024  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8025  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8026  F->insert(MBBIter, newMBB);
8027  F->insert(MBBIter, nextMBB);
8028
8029  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
8030  nextMBB->splice(nextMBB->begin(), thisMBB,
8031                  llvm::next(MachineBasicBlock::iterator(bInstr)),
8032                  thisMBB->end());
8033  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
8034
8035  // Update thisMBB to fall through to newMBB
8036  thisMBB->addSuccessor(newMBB);
8037
8038  // newMBB jumps to itself and falls through to nextMBB
8039  newMBB->addSuccessor(nextMBB);
8040  newMBB->addSuccessor(newMBB);
8041
8042  // Insert instructions into newMBB based on incoming instruction
8043  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
8044         "unexpected number of operands");
8045  DebugLoc dl = bInstr->getDebugLoc();
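  // Operand 0 of the pseudo is the destination register; the operands after it
  // are the memory address, followed by the source value.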
8046  MachineOperand& destOper = bInstr->getOperand(0);
8047  MachineOperand* argOpers[2 + X86::AddrNumOperands];
8048  int numArgs = bInstr->getNumOperands() - 1;
8049  for (int i=0; i < numArgs; ++i)
8050    argOpers[i] = &bInstr->getOperand(i+1);
8051
8052  // x86 address has 5 operands: base, scale, index, displacement, and segment.
8053  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
8054  int valArgIndx = lastAddrIndx + 1;
8055
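  // Load the current value at the memory location into t1.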
8056  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8057  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
8058  for (int i=0; i <= lastAddrIndx; ++i)
8059    (*MIB).addOperand(*argOpers[i]);
8060
8061  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
8062  if (invSrc) {
8063    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
8064  }
8065  else
8066    tt = t1;
8067
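  // Compute t2 = op(tt, source value), where tt is the loaded value t1 or,
  // when invSrc is set (the NAND case), its bitwise NOT.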
8068  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8069  assert((argOpers[valArgIndx]->isReg() ||
8070          argOpers[valArgIndx]->isImm()) &&
8071         "invalid operand");
8072  if (argOpers[valArgIndx]->isReg())
8073    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
8074  else
8075    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
8076  MIB.addReg(tt);
8077  (*MIB).addOperand(*argOpers[valArgIndx]);
8078
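  // CMPXCHG compares the accumulator (EAX/AX/AL/RAX) with memory, so place the
  // previously loaded value there first.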
8079  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
8080  MIB.addReg(t1);
8081
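  // Locked compare-and-exchange: if memory still holds t1, t2 is stored;
  // otherwise the accumulator is reloaded with the current memory value and
  // the branch below retries the loop.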
8082  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
8083  for (int i=0; i <= lastAddrIndx; ++i)
8084    (*MIB).addOperand(*argOpers[i]);
8085  MIB.addReg(t2);
8086  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8087  (*MIB).setMemRefs(bInstr->memoperands_begin(),
8088                    bInstr->memoperands_end());
8089
8090  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
8091  MIB.addReg(EAXreg);
8092
8093  // insert branch
8094  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8095
8096  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
8097  return nextMBB;
8098}
8099
8100// private utility function: 64-bit atomics on a 32-bit host.
8101MachineBasicBlock *
8102X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
8103                                                       MachineBasicBlock *MBB,
8104                                                       unsigned regOpcL,
8105                                                       unsigned regOpcH,
8106                                                       unsigned immOpcL,
8107                                                       unsigned immOpcH,
8108                                                       bool invSrc) const {
8109  // For the atomic bitwise operator, we generate
8110  //   thisMBB (instructions are in pairs, except cmpxchg8b)
8111  //     ld t1,t2 = [bitinstr.addr]
8112  //   newMBB:
8113  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
8114  //     op  t5, t6 <- out1, out2, [bitinstr.val]
8115  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
8116  //     mov ECX, EBX <- t5, t6
8117  //     mov EAX, EDX <- t1, t2
8118  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
8119  //     mov t3, t4 <- EAX, EDX
8120  //     bz  newMBB
8121  //     result in out1, out2
8122  //     fallthrough -->nextMBB
8123
8124  const TargetRegisterClass *RC = X86::GR32RegisterClass;
8125  const unsigned LoadOpc = X86::MOV32rm;
8126  const unsigned NotOpc = X86::NOT32r;
8127  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8128  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8129  MachineFunction::iterator MBBIter = MBB;
8130  ++MBBIter;
8131
8132  // First build the CFG
8133  MachineFunction *F = MBB->getParent();
8134  MachineBasicBlock *thisMBB = MBB;
8135  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8136  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8137  F->insert(MBBIter, newMBB);
8138  F->insert(MBBIter, nextMBB);
8139
8140  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
8141  nextMBB->splice(nextMBB->begin(), thisMBB,
8142                  llvm::next(MachineBasicBlock::iterator(bInstr)),
8143                  thisMBB->end());
8144  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
8145
8146  // Update thisMBB to fall through to newMBB
8147  thisMBB->addSuccessor(newMBB);
8148
8149  // newMBB jumps to itself and falls through to nextMBB
8150  newMBB->addSuccessor(nextMBB);
8151  newMBB->addSuccessor(newMBB);
8152
8153  DebugLoc dl = bInstr->getDebugLoc();
8154  // Insert instructions into newMBB based on incoming instruction
8155  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
8156  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
8157         "unexpected number of operands");
8158  MachineOperand& dest1Oper = bInstr->getOperand(0);
8159  MachineOperand& dest2Oper = bInstr->getOperand(1);
8160  MachineOperand* argOpers[2 + X86::AddrNumOperands];
8161  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
8162    argOpers[i] = &bInstr->getOperand(i+2);
8163
8164    // We use some of the operands multiple times, so conservatively just
8165    // clear any kill flags that might be present.
8166    if (argOpers[i]->isReg() && argOpers[i]->isUse())
8167      argOpers[i]->setIsKill(false);
8168  }
8169
8170  // x86 address has 5 operands: base, scale, index, displacement, and segment.
8171  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
8172
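  // In thisMBB, load both 32-bit halves of the 64-bit value: the low half into
  // t1 and the high half (at displacement + 4) into t2.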
8173  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8174  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
8175  for (int i=0; i <= lastAddrIndx; ++i)
8176    (*MIB).addOperand(*argOpers[i]);
8177  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8178  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
8179  // add 4 to displacement.
8180  for (int i=0; i <= lastAddrIndx-2; ++i)
8181    (*MIB).addOperand(*argOpers[i]);
8182  MachineOperand newOp3 = *(argOpers[3]);
8183  if (newOp3.isImm())
8184    newOp3.setImm(newOp3.getImm()+4);
8185  else
8186    newOp3.setOffset(newOp3.getOffset()+4);
8187  (*MIB).addOperand(newOp3);
8188  (*MIB).addOperand(*argOpers[lastAddrIndx]);
8189
8190  // t3/4 are defined later, at the bottom of the loop
8191  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
8192  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
8193  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
8194    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
8195  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
8196    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
8197
8198  // The subsequent operations should be using the destination registers of
8199  // the PHI instructions.
8200  if (invSrc) {
8201    t1 = F->getRegInfo().createVirtualRegister(RC);
8202    t2 = F->getRegInfo().createVirtualRegister(RC);
8203    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
8204    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
8205  } else {
8206    t1 = dest1Oper.getReg();
8207    t2 = dest2Oper.getReg();
8208  }
8209
8210  int valArgIndx = lastAddrIndx + 1;
8211  assert((argOpers[valArgIndx]->isReg() ||
8212          argOpers[valArgIndx]->isImm()) &&
8213         "invalid operand");
8214  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
8215  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
8216  if (argOpers[valArgIndx]->isReg())
8217    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
8218  else
8219    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
8220  if (regOpcL != X86::MOV32rr)
8221    MIB.addReg(t1);
8222  (*MIB).addOperand(*argOpers[valArgIndx]);
8223  assert(argOpers[valArgIndx + 1]->isReg() ==
8224         argOpers[valArgIndx]->isReg());
8225  assert(argOpers[valArgIndx + 1]->isImm() ==
8226         argOpers[valArgIndx]->isImm());
8227  if (argOpers[valArgIndx + 1]->isReg())
8228    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
8229  else
8230    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
8231  if (regOpcH != X86::MOV32rr)
8232    MIB.addReg(t2);
8233  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
8234
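  // CMPXCHG8B compares EDX:EAX with the 64-bit memory operand and, if they
  // match, stores ECX:EBX there, so load all four registers before issuing it.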
8235  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
8236  MIB.addReg(t1);
8237  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
8238  MIB.addReg(t2);
8239
8240  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
8241  MIB.addReg(t5);
8242  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
8243  MIB.addReg(t6);
8244
8245  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
8246  for (int i=0; i <= lastAddrIndx; ++i)
8247    (*MIB).addOperand(*argOpers[i]);
8248
8249  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8250  (*MIB).setMemRefs(bInstr->memoperands_begin(),
8251                    bInstr->memoperands_end());
8252
8253  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
8254  MIB.addReg(X86::EAX);
8255  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
8256  MIB.addReg(X86::EDX);
8257
8258  // insert branch
8259  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8260
8261  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
8262  return nextMBB;
8263}
8264
8265// private utility function
8266MachineBasicBlock *
8267X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
8268                                                      MachineBasicBlock *MBB,
8269                                                      unsigned cmovOpc) const {
8270  // For the atomic min/max operator, we generate
8271  //   thisMBB:
8272  //   newMBB:
8273  //     ld t1 = [min/max.addr]
8274  //     mov t2 = [min/max.val]
8275  //     cmp  t1, t2
8276  //     cmov[cond] t2 = t1
8277  //     mov EAX = t1
8278  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
8279  //     bz   newMBB
8280  //     fallthrough -->nextMBB
8281  //
8282  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8283  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8284  MachineFunction::iterator MBBIter = MBB;
8285  ++MBBIter;
8286
8287  // First build the CFG
8288  MachineFunction *F = MBB->getParent();
8289  MachineBasicBlock *thisMBB = MBB;
8290  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8291  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8292  F->insert(MBBIter, newMBB);
8293  F->insert(MBBIter, nextMBB);
8294
8295  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
8296  nextMBB->splice(nextMBB->begin(), thisMBB,
8297                  llvm::next(MachineBasicBlock::iterator(mInstr)),
8298                  thisMBB->end());
8299  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
8300
8301  // Update thisMBB to fall through to newMBB
8302  thisMBB->addSuccessor(newMBB);
8303
8304  // newMBB jumps to itself and falls through to nextMBB
8305  newMBB->addSuccessor(nextMBB);
8306  newMBB->addSuccessor(newMBB);
8307
8308  DebugLoc dl = mInstr->getDebugLoc();
8309  // Insert instructions into newMBB based on incoming instruction
8310  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
8311         "unexpected number of operands");
8312  MachineOperand& destOper = mInstr->getOperand(0);
8313  MachineOperand* argOpers[2 + X86::AddrNumOperands];
8314  int numArgs = mInstr->getNumOperands() - 1;
8315  for (int i=0; i < numArgs; ++i)
8316    argOpers[i] = &mInstr->getOperand(i+1);
8317
8318  // x86 address has 5 operands: base, scale, index, displacement, and segment.
8319  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
8320  int valArgIndx = lastAddrIndx + 1;
8321
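  // Load the current value at the memory location into t1.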
8322  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8323  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
8324  for (int i=0; i <= lastAddrIndx; ++i)
8325    (*MIB).addOperand(*argOpers[i]);
8326
8327  // We only support register and immediate values
8328  assert((argOpers[valArgIndx]->isReg() ||
8329          argOpers[valArgIndx]->isImm()) &&
8330         "invalid operand");
8331
8332  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8333  if (argOpers[valArgIndx]->isReg())
8334    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
8335  else
8336    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
8337  (*MIB).addOperand(*argOpers[valArgIndx]);
8338
8339  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
8340  MIB.addReg(t1);
8341
8342  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
8343  MIB.addReg(t1);
8344  MIB.addReg(t2);
8345
8346  // Generate the conditional move that leaves the min/max value in t3
8347  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8348  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
8349  MIB.addReg(t2);
8350  MIB.addReg(t1);
8351
8352  // Compare and exchange if nothing else has modified the memory location
8353  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
8354  for (int i=0; i <= lastAddrIndx; ++i)
8355    (*MIB).addOperand(*argOpers[i]);
8356  MIB.addReg(t3);
8357  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8358  (*MIB).setMemRefs(mInstr->memoperands_begin(),
8359                    mInstr->memoperands_end());
8360
8361  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
8362  MIB.addReg(X86::EAX);
8363
8364  // insert branch
8365  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8366
8367  mInstr->eraseFromParent();   // The pseudo instruction is gone now.
8368  return nextMBB;
8369}
8370
8371// FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8,
8372// all of this code can be replaced by patterns in the .td file.
8373MachineBasicBlock *
8374X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
8375                            unsigned numArgs, bool memArg) const {
8376
8377  DebugLoc dl = MI->getDebugLoc();
8378  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8379
8380  unsigned Opc;
8381  if (memArg)
8382    Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
8383  else
8384    Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
8385
8386  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
8387
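  // Copy the pseudo's source operands (operand 0 is the destination, handled
  // below) onto the real instruction, skipping any implicit register operands.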
8388  for (unsigned i = 0; i < numArgs; ++i) {
8389    MachineOperand &Op = MI->getOperand(i+1);
8390
8391    if (!(Op.isReg() && Op.isImplicit()))
8392      MIB.addOperand(Op);
8393  }
8394
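  // The string-compare result lands in XMM0; copy it into the pseudo's
  // destination register.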
8395  BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
8396    .addReg(X86::XMM0);
8397
8398  MI->eraseFromParent();
8399
8400  return BB;
8401}
8402
8403MachineBasicBlock *
8404X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
8405                                                 MachineInstr *MI,
8406                                                 MachineBasicBlock *MBB) const {
8407  // Emit code to save XMM registers to the stack. The ABI says that the
8408  // number of registers to save is given in %al, so it's theoretically
8409  // possible to do an indirect jump trick to avoid saving all of them;
8410  // however, this code takes a simpler approach and just executes all
8411  // of the stores if %al is non-zero. It's less code, and it's probably
8412  // easier on the hardware branch predictor, and stores aren't all that
8413  // expensive anyway.
8414
8415  // Create the new basic blocks. One block contains all the XMM stores,
8416  // and one block is the final destination regardless of whether any
8417  // stores were performed.
8418  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8419  MachineFunction *F = MBB->getParent();
8420  MachineFunction::iterator MBBIter = MBB;
8421  ++MBBIter;
8422  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
8423  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
8424  F->insert(MBBIter, XMMSaveMBB);
8425  F->insert(MBBIter, EndMBB);
8426
8427  // Transfer the remainder of MBB and its successor edges to EndMBB.
8428  EndMBB->splice(EndMBB->begin(), MBB,
8429                 llvm::next(MachineBasicBlock::iterator(MI)),
8430                 MBB->end());
8431  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
8432
8433  // The original block will now fall through to the XMM save block.
8434  MBB->addSuccessor(XMMSaveMBB);
8435  // The XMMSaveMBB will fall through to the end block.
8436  XMMSaveMBB->addSuccessor(EndMBB);
8437
8438  // Now add the instructions.
8439  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8440  DebugLoc DL = MI->getDebugLoc();
8441
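  // Operand 0 is the register holding the number of XMM registers used (passed
  // in %al per the ABI), operand 1 is the frame index of the register save
  // area, operand 2 is the offset of the FP register area within it, and the
  // remaining operands are the live XMM argument registers to store.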
8442  unsigned CountReg = MI->getOperand(0).getReg();
8443  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
8444  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
8445
8446  if (!Subtarget->isTargetWin64()) {
8447    // If %al is 0, branch around the XMM save block.
8448    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
8449    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
8450    MBB->addSuccessor(EndMBB);
8451  }
8452
8453  // In the XMM save block, save all the XMM argument registers.
8454  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
8455    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
8456    MachineMemOperand *MMO =
8457      F->getMachineMemOperand(
8458        PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
8459        MachineMemOperand::MOStore, Offset,
8460        /*Size=*/16, /*Align=*/16);
8461    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
8462      .addFrameIndex(RegSaveFrameIndex)
8463      .addImm(/*Scale=*/1)
8464      .addReg(/*IndexReg=*/0)
8465      .addImm(/*Disp=*/Offset)
8466      .addReg(/*Segment=*/0)
8467      .addReg(MI->getOperand(i).getReg())
8468      .addMemOperand(MMO);
8469  }
8470
8471  MI->eraseFromParent();   // The pseudo instruction is gone now.
8472
8473  return EndMBB;
8474}
8475
8476MachineBasicBlock *
8477X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
8478                                     MachineBasicBlock *BB) const {
8479  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8480  DebugLoc DL = MI->getDebugLoc();
8481
8482  // To "insert" a SELECT_CC instruction, we actually have to insert the
8483  // diamond control-flow pattern.  The incoming instruction knows the
8484  // destination vreg to set, the condition code register to branch on, the
8485  // true/false values to select between, and a branch opcode to use.
8486  const BasicBlock *LLVM_BB = BB->getBasicBlock();
8487  MachineFunction::iterator It = BB;
8488  ++It;
8489
8490  //  thisMBB:
8491  //  ...
8492  //   TrueVal = ...
8493  //   cmpTY ccX, r1, r2
8494  //   bCC copy1MBB
8495  //   fallthrough --> copy0MBB
8496  MachineBasicBlock *thisMBB = BB;
8497  MachineFunction *F = BB->getParent();
8498  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
8499  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
8500  F->insert(It, copy0MBB);
8501  F->insert(It, sinkMBB);
8502
8503  // If the EFLAGS register isn't dead in the terminator, then claim that it's
8504  // live into the sink and copy blocks.
8505  const MachineFunction *MF = BB->getParent();
8506  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
8507  BitVector ReservedRegs = TRI->getReservedRegs(*MF);
8508
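  // Scan the pseudo CMOV for a use of EFLAGS that is not killed; if one is
  // found, EFLAGS is live past the select and must be marked live-in to both
  // new blocks.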
8509  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
8510    const MachineOperand &MO = MI->getOperand(I);
8511    if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
8512    unsigned Reg = MO.getReg();
8513    if (Reg != X86::EFLAGS) continue;
8514    copy0MBB->addLiveIn(Reg);
8515    sinkMBB->addLiveIn(Reg);
8516  }
8517
8518  // Transfer the remainder of BB and its successor edges to sinkMBB.
8519  sinkMBB->splice(sinkMBB->begin(), BB,
8520                  llvm::next(MachineBasicBlock::iterator(MI)),
8521                  BB->end());
8522  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
8523
8524  // Add the true and fallthrough blocks as its successors.
8525  BB->addSuccessor(copy0MBB);
8526  BB->addSuccessor(sinkMBB);
8527
8528  // Create the conditional branch instruction.
8529  unsigned Opc =
8530    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
8531  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
8532
8533  //  copy0MBB:
8534  //   %FalseValue = ...
8535  //   # fallthrough to sinkMBB
8536  copy0MBB->addSuccessor(sinkMBB);
8537
8538  //  sinkMBB:
8539  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
8540  //  ...
8541  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
8542          TII->get(X86::PHI), MI->getOperand(0).getReg())
8543    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
8544    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
8545
8546  MI->eraseFromParent();   // The pseudo instruction is gone now.
8547  return sinkMBB;
8548}
8549
8550MachineBasicBlock *
8551X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI,
8552                                          MachineBasicBlock *BB) const {
8553  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8554  DebugLoc DL = MI->getDebugLoc();
8555
8556  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
8557  // non-trivial part is impdef of ESP.
8558  // FIXME: The code should be tweaked as soon as we'll try to do codegen for
8559  // mingw-w64.
8560
8561  BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
8562    .addExternalSymbol("_alloca")
8563    .addReg(X86::EAX, RegState::Implicit)
8564    .addReg(X86::ESP, RegState::Implicit)
8565    .addReg(X86::EAX, RegState::Define | RegState::Implicit)
8566    .addReg(X86::ESP, RegState::Define | RegState::Implicit);
8567
8568  MI->eraseFromParent();   // The pseudo instruction is gone now.
8569  return BB;
8570}
8571
8572MachineBasicBlock *
8573X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
8574                                      MachineBasicBlock *BB) const {
8575  // This is pretty easy.  We're taking the value that we received from
8576  // our load from the relocation, sticking it in either RDI (x86-64)
8577  // or EAX and doing an indirect call.  The return value will then
8578  // be in the normal return register.
8579  const X86InstrInfo *TII
8580    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
8581  DebugLoc DL = MI->getDebugLoc();
8582  MachineFunction *F = BB->getParent();
8583
8584  assert(MI->getOperand(3).isGlobal() && "This should be a global");
8585
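  // Only the addressing mode of the load differs between the three cases
  // below: RIP-relative on x86-64, an absolute address on non-PIC x86-32, and
  // relative to the PIC base register otherwise. In each case the call is then
  // made indirectly through the register that received the load.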
8586  if (Subtarget->is64Bit()) {
8587    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
8588                                      TII->get(X86::MOV64rm), X86::RDI)
8589    .addReg(X86::RIP)
8590    .addImm(0).addReg(0)
8591    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
8592                      MI->getOperand(3).getTargetFlags())
8593    .addReg(0);
8594    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
8595    addDirectMem(MIB, X86::RDI);
8596  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
8597    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
8598                                      TII->get(X86::MOV32rm), X86::EAX)
8599    .addReg(0)
8600    .addImm(0).addReg(0)
8601    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
8602                      MI->getOperand(3).getTargetFlags())
8603    .addReg(0);
8604    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
8605    addDirectMem(MIB, X86::EAX);
8606  } else {
8607    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
8608                                      TII->get(X86::MOV32rm), X86::EAX)
8609    .addReg(TII->getGlobalBaseReg(F))
8610    .addImm(0).addReg(0)
8611    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
8612                      MI->getOperand(3).getTargetFlags())
8613    .addReg(0);
8614    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
8615    addDirectMem(MIB, X86::EAX);
8616  }
8617
8618  MI->eraseFromParent(); // The pseudo instruction is gone now.
8619  return BB;
8620}
8621
8622MachineBasicBlock *
8623X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
8624                                               MachineBasicBlock *BB) const {
8625  switch (MI->getOpcode()) {
8626  default: assert(false && "Unexpected instr type to insert");
8627  case X86::MINGW_ALLOCA:
8628    return EmitLoweredMingwAlloca(MI, BB);
8629  case X86::TLSCall_32:
8630  case X86::TLSCall_64:
8631    return EmitLoweredTLSCall(MI, BB);
8632  case X86::CMOV_GR8:
8633  case X86::CMOV_V1I64:
8634  case X86::CMOV_FR32:
8635  case X86::CMOV_FR64:
8636  case X86::CMOV_V4F32:
8637  case X86::CMOV_V2F64:
8638  case X86::CMOV_V2I64:
8639  case X86::CMOV_GR16:
8640  case X86::CMOV_GR32:
8641  case X86::CMOV_RFP32:
8642  case X86::CMOV_RFP64:
8643  case X86::CMOV_RFP80:
8644    return EmitLoweredSelect(MI, BB);
8645
8646  case X86::FP32_TO_INT16_IN_MEM:
8647  case X86::FP32_TO_INT32_IN_MEM:
8648  case X86::FP32_TO_INT64_IN_MEM:
8649  case X86::FP64_TO_INT16_IN_MEM:
8650  case X86::FP64_TO_INT32_IN_MEM:
8651  case X86::FP64_TO_INT64_IN_MEM:
8652  case X86::FP80_TO_INT16_IN_MEM:
8653  case X86::FP80_TO_INT32_IN_MEM:
8654  case X86::FP80_TO_INT64_IN_MEM: {
8655    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8656    DebugLoc DL = MI->getDebugLoc();
8657
8658    // Change the floating point control register to use "round towards zero"
8659    // mode when truncating to an integer value.
8660    MachineFunction *F = BB->getParent();
8661    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
8662    addFrameReference(BuildMI(*BB, MI, DL,
8663                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
8664
8665    // Load the old value of the high byte of the control word...
8666    unsigned OldCW =
8667      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
8668    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
8669                      CWFrameIdx);
8670
8671    // Set the high part to be round to zero...
8672    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
8673      .addImm(0xC7F);
8674
8675    // Reload the modified control word now...
8676    addFrameReference(BuildMI(*BB, MI, DL,
8677                              TII->get(X86::FLDCW16m)), CWFrameIdx);
8678
8679    // Restore the memory image of control word to original value
8680    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
8681      .addReg(OldCW);
8682
8683    // Get the X86 opcode to use.
8684    unsigned Opc;
8685    switch (MI->getOpcode()) {
8686    default: llvm_unreachable("illegal opcode!");
8687    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
8688    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
8689    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
8690    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
8691    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
8692    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
8693    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
8694    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
8695    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
8696    }
8697
8698    X86AddressMode AM;
8699    MachineOperand &Op = MI->getOperand(0);
8700    if (Op.isReg()) {
8701      AM.BaseType = X86AddressMode::RegBase;
8702      AM.Base.Reg = Op.getReg();
8703    } else {
8704      AM.BaseType = X86AddressMode::FrameIndexBase;
8705      AM.Base.FrameIndex = Op.getIndex();
8706    }
8707    Op = MI->getOperand(1);
8708    if (Op.isImm())
8709      AM.Scale = Op.getImm();
8710    Op = MI->getOperand(2);
8711    if (Op.isImm())
8712      AM.IndexReg = Op.getImm();
8713    Op = MI->getOperand(3);
8714    if (Op.isGlobal()) {
8715      AM.GV = Op.getGlobal();
8716    } else {
8717      AM.Disp = Op.getImm();
8718    }
8719    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
8720                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
8721
8722    // Reload the original control word now.
8723    addFrameReference(BuildMI(*BB, MI, DL,
8724                              TII->get(X86::FLDCW16m)), CWFrameIdx);
8725
8726    MI->eraseFromParent();   // The pseudo instruction is gone now.
8727    return BB;
8728  }
8729    // String/text processing lowering.
8730  case X86::PCMPISTRM128REG:
8731    return EmitPCMP(MI, BB, 3, false /* memArg */);
8732  case X86::PCMPISTRM128MEM:
8733    return EmitPCMP(MI, BB, 3, true /* memArg */);
8734  case X86::PCMPESTRM128REG:
8735    return EmitPCMP(MI, BB, 5, false /* memArg */);
8736  case X86::PCMPESTRM128MEM:
8737    return EmitPCMP(MI, BB, 5, true /* memArg */);
8738
8739    // Atomic Lowering.
8740  case X86::ATOMAND32:
8741    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
8742                                               X86::AND32ri, X86::MOV32rm,
8743                                               X86::LCMPXCHG32,
8744                                               X86::NOT32r, X86::EAX,
8745                                               X86::GR32RegisterClass);
8746  case X86::ATOMOR32:
8747    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
8748                                               X86::OR32ri, X86::MOV32rm,
8749                                               X86::LCMPXCHG32,
8750                                               X86::NOT32r, X86::EAX,
8751                                               X86::GR32RegisterClass);
8752  case X86::ATOMXOR32:
8753    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
8754                                               X86::XOR32ri, X86::MOV32rm,
8755                                               X86::LCMPXCHG32,
8756                                               X86::NOT32r, X86::EAX,
8757                                               X86::GR32RegisterClass);
8758  case X86::ATOMNAND32:
8759    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
8760                                               X86::AND32ri, X86::MOV32rm,
8761                                               X86::LCMPXCHG32,
8762                                               X86::NOT32r, X86::EAX,
8763                                               X86::GR32RegisterClass, true);
8764  case X86::ATOMMIN32:
8765    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
8766  case X86::ATOMMAX32:
8767    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
8768  case X86::ATOMUMIN32:
8769    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
8770  case X86::ATOMUMAX32:
8771    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
8772
8773  case X86::ATOMAND16:
8774    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
8775                                               X86::AND16ri, X86::MOV16rm,
8776                                               X86::LCMPXCHG16,
8777                                               X86::NOT16r, X86::AX,
8778                                               X86::GR16RegisterClass);
8779  case X86::ATOMOR16:
8780    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
8781                                               X86::OR16ri, X86::MOV16rm,
8782                                               X86::LCMPXCHG16,
8783                                               X86::NOT16r, X86::AX,
8784                                               X86::GR16RegisterClass);
8785  case X86::ATOMXOR16:
8786    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
8787                                               X86::XOR16ri, X86::MOV16rm,
8788                                               X86::LCMPXCHG16,
8789                                               X86::NOT16r, X86::AX,
8790                                               X86::GR16RegisterClass);
8791  case X86::ATOMNAND16:
8792    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
8793                                               X86::AND16ri, X86::MOV16rm,
8794                                               X86::LCMPXCHG16,
8795                                               X86::NOT16r, X86::AX,
8796                                               X86::GR16RegisterClass, true);
8797  case X86::ATOMMIN16:
8798    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
8799  case X86::ATOMMAX16:
8800    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
8801  case X86::ATOMUMIN16:
8802    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
8803  case X86::ATOMUMAX16:
8804    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
8805
8806  case X86::ATOMAND8:
8807    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
8808                                               X86::AND8ri, X86::MOV8rm,
8809                                               X86::LCMPXCHG8,
8810                                               X86::NOT8r, X86::AL,
8811                                               X86::GR8RegisterClass);
8812  case X86::ATOMOR8:
8813    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
8814                                               X86::OR8ri, X86::MOV8rm,
8815                                               X86::LCMPXCHG8,
8816                                               X86::NOT8r, X86::AL,
8817                                               X86::GR8RegisterClass);
8818  case X86::ATOMXOR8:
8819    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
8820                                               X86::XOR8ri, X86::MOV8rm,
8821                                               X86::LCMPXCHG8,
8822                                               X86::NOT8r, X86::AL,
8823                                               X86::GR8RegisterClass);
8824  case X86::ATOMNAND8:
8825    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
8826                                               X86::AND8ri, X86::MOV8rm,
8827                                               X86::LCMPXCHG8,
8828                                               X86::NOT8r, X86::AL,
8829                                               X86::GR8RegisterClass, true);
8830  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
8831  // This group is for 64-bit host.
8832  case X86::ATOMAND64:
8833    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
8834                                               X86::AND64ri32, X86::MOV64rm,
8835                                               X86::LCMPXCHG64,
8836                                               X86::NOT64r, X86::RAX,
8837                                               X86::GR64RegisterClass);
8838  case X86::ATOMOR64:
8839    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
8840                                               X86::OR64ri32, X86::MOV64rm,
8841                                               X86::LCMPXCHG64,
8842                                               X86::NOT64r, X86::RAX,
8843                                               X86::GR64RegisterClass);
8844  case X86::ATOMXOR64:
8845    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
8846                                               X86::XOR64ri32, X86::MOV64rm,
8847                                               X86::LCMPXCHG64,
8848                                               X86::NOT64r, X86::RAX,
8849                                               X86::GR64RegisterClass);
8850  case X86::ATOMNAND64:
8851    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
8852                                               X86::AND64ri32, X86::MOV64rm,
8853                                               X86::LCMPXCHG64,
8854                                               X86::NOT64r, X86::RAX,
8855                                               X86::GR64RegisterClass, true);
8856  case X86::ATOMMIN64:
8857    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
8858  case X86::ATOMMAX64:
8859    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
8860  case X86::ATOMUMIN64:
8861    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
8862  case X86::ATOMUMAX64:
8863    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
8864
8865  // This group does 64-bit operations on a 32-bit host.
8866  case X86::ATOMAND6432:
8867    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8868                                               X86::AND32rr, X86::AND32rr,
8869                                               X86::AND32ri, X86::AND32ri,
8870                                               false);
8871  case X86::ATOMOR6432:
8872    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8873                                               X86::OR32rr, X86::OR32rr,
8874                                               X86::OR32ri, X86::OR32ri,
8875                                               false);
8876  case X86::ATOMXOR6432:
8877    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8878                                               X86::XOR32rr, X86::XOR32rr,
8879                                               X86::XOR32ri, X86::XOR32ri,
8880                                               false);
8881  case X86::ATOMNAND6432:
8882    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8883                                               X86::AND32rr, X86::AND32rr,
8884                                               X86::AND32ri, X86::AND32ri,
8885                                               true);
8886  case X86::ATOMADD6432:
8887    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8888                                               X86::ADD32rr, X86::ADC32rr,
8889                                               X86::ADD32ri, X86::ADC32ri,
8890                                               false);
8891  case X86::ATOMSUB6432:
8892    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8893                                               X86::SUB32rr, X86::SBB32rr,
8894                                               X86::SUB32ri, X86::SBB32ri,
8895                                               false);
8896  case X86::ATOMSWAP6432:
8897    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8898                                               X86::MOV32rr, X86::MOV32rr,
8899                                               X86::MOV32ri, X86::MOV32ri,
8900                                               false);
8901  case X86::VASTART_SAVE_XMM_REGS:
8902    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
8903  }
8904}
8905
8906//===----------------------------------------------------------------------===//
8907//                           X86 Optimization Hooks
8908//===----------------------------------------------------------------------===//
8909
8910void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
8911                                                       const APInt &Mask,
8912                                                       APInt &KnownZero,
8913                                                       APInt &KnownOne,
8914                                                       const SelectionDAG &DAG,
8915                                                       unsigned Depth) const {
8916  unsigned Opc = Op.getOpcode();
8917  assert((Opc >= ISD::BUILTIN_OP_END ||
8918          Opc == ISD::INTRINSIC_WO_CHAIN ||
8919          Opc == ISD::INTRINSIC_W_CHAIN ||
8920          Opc == ISD::INTRINSIC_VOID) &&
8921         "Should use MaskedValueIsZero if you don't know whether Op"
8922         " is a target node!");
8923
8924  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
8925  switch (Opc) {
8926  default: break;
8927  case X86ISD::ADD:
8928  case X86ISD::SUB:
8929  case X86ISD::SMUL:
8930  case X86ISD::UMUL:
8931  case X86ISD::INC:
8932  case X86ISD::DEC:
8933  case X86ISD::OR:
8934  case X86ISD::XOR:
8935  case X86ISD::AND:
8936    // These nodes' second result is a boolean.
8937    if (Op.getResNo() == 0)
8938      break;
8939    // Fallthrough
8940  case X86ISD::SETCC:
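    // These nodes produce a zero-or-one value, so every bit above bit 0 is
    // known to be zero.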
8941    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
8942                                       Mask.getBitWidth() - 1);
8943    break;
8944  }
8945}
8946
8947/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
8948/// node is a GlobalAddress + offset.
8949bool X86TargetLowering::isGAPlusOffset(SDNode *N,
8950                                       const GlobalValue* &GA,
8951                                       int64_t &Offset) const {
8952  if (N->getOpcode() == X86ISD::Wrapper) {
8953    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
8954      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
8955      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
8956      return true;
8957    }
8958  }
8959  return TargetLowering::isGAPlusOffset(N, GA, Offset);
8960}
8961
8962/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
8963/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
8964/// if the load addresses are consecutive, non-overlapping, and in the right
8965/// order.
8966static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
8967                                     const TargetLowering &TLI) {
8968  DebugLoc dl = N->getDebugLoc();
8969  EVT VT = N->getValueType(0);
8970  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
8971
8972  if (VT.getSizeInBits() != 128)
8973    return SDValue();
8974
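  // Gather the scalar element feeding each lane; if they turn out to be
  // consecutive, non-overlapping loads, EltsFromConsecutiveLoads folds them
  // into a single 128-bit load.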
8975  SmallVector<SDValue, 16> Elts;
8976  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
8977    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));
8978
8979  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
8980}
8981
8982/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index generation
8983/// and convert it from being a bunch of shuffles and extracts to a simple
8984/// store and scalar loads to extract the elements.
8985static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
8986                                                const TargetLowering &TLI) {
8987  SDValue InputVector = N->getOperand(0);
8988
8989  // Only operate on vectors of 4 elements, where the alternative shuffling
8990  // gets to be more expensive.
8991  if (InputVector.getValueType() != MVT::v4i32)
8992    return SDValue();
8993
8994  // Check that every use of InputVector is an EXTRACT_VECTOR_ELT whose only
8995  // use is a sign-extend or zero-extend, and that all of the vector's
8996  // elements are used.
8997  SmallVector<SDNode *, 4> Uses;
8998  unsigned ExtractedElements = 0;
8999  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
9000       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
9001    if (UI.getUse().getResNo() != InputVector.getResNo())
9002      return SDValue();
9003
9004    SDNode *Extract = *UI;
9005    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9006      return SDValue();
9007
9008    if (Extract->getValueType(0) != MVT::i32)
9009      return SDValue();
9010    if (!Extract->hasOneUse())
9011      return SDValue();
9012    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
9013        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
9014      return SDValue();
9015    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
9016      return SDValue();
9017
9018    // Record which element was extracted.
9019    ExtractedElements |=
9020      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
9021
9022    Uses.push_back(Extract);
9023  }
9024
9025  // If not all the elements were used, this may not be worthwhile.
9026  if (ExtractedElements != 15)
9027    return SDValue();
9028
9029  // Ok, we've now decided to do the transformation.
9030  DebugLoc dl = InputVector.getDebugLoc();
9031
9032  // Store the value to a temporary stack slot.
9033  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
9034  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL,
9035                            0, false, false, 0);
9036
9037  // Replace each use (extract) with a load of the appropriate element.
9038  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
9039       UE = Uses.end(); UI != UE; ++UI) {
9040    SDNode *Extract = *UI;
9041
9042    // Compute the element's address.
9043    SDValue Idx = Extract->getOperand(1);
9044    unsigned EltSize =
9045        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
9046    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
9047    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
9048
9049    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
9050                                     OffsetVal, StackPtr);
9051
9052    // Load the scalar.
9053    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
9054                                     ScalarAddr, NULL, 0, false, false, 0);
9055
9056    // Replace the extract with the load.
9057    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
9058  }
9059
9060  // The replacement was made in place; don't return anything.
9061  return SDValue();
9062}
9063
9064/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
9065static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
9066                                    const X86Subtarget *Subtarget) {
9067  DebugLoc DL = N->getDebugLoc();
9068  SDValue Cond = N->getOperand(0);
9069  // Get the LHS/RHS of the select.
9070  SDValue LHS = N->getOperand(1);
9071  SDValue RHS = N->getOperand(2);
9072
9073  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
9074  // instructions match the semantics of the common C idiom x<y?x:y but not
9075  // x<=y?x:y, because of how they handle negative zero (which can be
9076  // ignored in unsafe-math mode).
9077  if (Subtarget->hasSSE2() &&
9078      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
9079      Cond.getOpcode() == ISD::SETCC) {
9080    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
9081
9082    unsigned Opcode = 0;
9083    // Check for x CC y ? x : y.
9084    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
9085        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
9086      switch (CC) {
9087      default: break;
9088      case ISD::SETULT:
9089        // Converting this to a min would handle NaNs incorrectly, and swapping
9090        // the operands would cause it to handle comparisons between positive
9091        // and negative zero incorrectly.
9092        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
9093          if (!UnsafeFPMath &&
9094              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9095            break;
9096          std::swap(LHS, RHS);
9097        }
9098        Opcode = X86ISD::FMIN;
9099        break;
9100      case ISD::SETOLE:
9101        // Converting this to a min would handle comparisons between positive
9102        // and negative zero incorrectly.
9103        if (!UnsafeFPMath &&
9104            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9105          break;
9106        Opcode = X86ISD::FMIN;
9107        break;
9108      case ISD::SETULE:
9109        // Converting this to a min would handle both negative zeros and NaNs
9110        // incorrectly, but we can swap the operands to fix both.
9111        std::swap(LHS, RHS);
9112      case ISD::SETOLT:
9113      case ISD::SETLT:
9114      case ISD::SETLE:
9115        Opcode = X86ISD::FMIN;
9116        break;
9117
9118      case ISD::SETOGE:
9119        // Converting this to a max would handle comparisons between positive
9120        // and negative zero incorrectly.
9121        if (!UnsafeFPMath &&
9122            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9123          break;
9124        Opcode = X86ISD::FMAX;
9125        break;
9126      case ISD::SETUGT:
9127        // Converting this to a max would handle NaNs incorrectly, and swapping
9128        // the operands would cause it to handle comparisons between positive
9129        // and negative zero incorrectly.
9130        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
9131          if (!UnsafeFPMath &&
9132              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9133            break;
9134          std::swap(LHS, RHS);
9135        }
9136        Opcode = X86ISD::FMAX;
9137        break;
9138      case ISD::SETUGE:
9139        // Converting this to a max would handle both negative zeros and NaNs
9140        // incorrectly, but we can swap the operands to fix both.
9141        std::swap(LHS, RHS);
9142      case ISD::SETOGT:
9143      case ISD::SETGT:
9144      case ISD::SETGE:
9145        Opcode = X86ISD::FMAX;
9146        break;
9147      }
9148    // Check for x CC y ? y : x -- a min/max with reversed arms.
9149    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
9150               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
9151      switch (CC) {
9152      default: break;
9153      case ISD::SETOGE:
9154        // Converting this to a min would handle comparisons between positive
9155        // and negative zero incorrectly, and swapping the operands would
9156        // cause it to handle NaNs incorrectly.
9157        if (!UnsafeFPMath &&
9158            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
9159          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9160            break;
9161          std::swap(LHS, RHS);
9162        }
9163        Opcode = X86ISD::FMIN;
9164        break;
9165      case ISD::SETUGT:
9166        // Converting this to a min would handle NaNs incorrectly.
9167        if (!UnsafeFPMath &&
9168            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9169          break;
9170        Opcode = X86ISD::FMIN;
9171        break;
9172      case ISD::SETUGE:
9173        // Converting this to a min would handle both negative zeros and NaNs
9174        // incorrectly, but we can swap the operands to fix both.
9175        std::swap(LHS, RHS);
9176      case ISD::SETOGT:
9177      case ISD::SETGT:
9178      case ISD::SETGE:
9179        Opcode = X86ISD::FMIN;
9180        break;
9181
9182      case ISD::SETULT:
9183        // Converting this to a max would handle NaNs incorrectly.
9184        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9185          break;
9186        Opcode = X86ISD::FMAX;
9187        break;
9188      case ISD::SETOLE:
9189        // Converting this to a max would handle comparisons between positive
9190        // and negative zero incorrectly, and swapping the operands would
9191        // cause it to handle NaNs incorrectly.
9192        if (!UnsafeFPMath &&
9193            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
9194          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9195            break;
9196          std::swap(LHS, RHS);
9197        }
9198        Opcode = X86ISD::FMAX;
9199        break;
9200      case ISD::SETULE:
9201        // Converting this to a max would handle both negative zeros and NaNs
9202        // incorrectly, but we can swap the operands to fix both.
9203        std::swap(LHS, RHS);
9204      case ISD::SETOLT:
9205      case ISD::SETLT:
9206      case ISD::SETLE:
9207        Opcode = X86ISD::FMAX;
9208        break;
9209      }
9210    }
9211
9212    if (Opcode)
9213      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
9214  }
9215
9216  // If this is a select between two integer constants, try to do some
9217  // optimizations.
9218  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
9219    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
9220      // Don't do this for crazy integer types.
9221      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
9222        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
9223        // so that TrueC (the true value) is larger than FalseC.
9224        bool NeedsCondInvert = false;
9225
9226        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
9227            // Efficiently invertible.
9228            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
9229             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
9230              isa<ConstantSDNode>(Cond.getOperand(1))))) {
9231          NeedsCondInvert = true;
9232          std::swap(TrueC, FalseC);
9233        }
9234
9235        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
9236        if (FalseC->getAPIntValue() == 0 &&
9237            TrueC->getAPIntValue().isPowerOf2()) {
9238          if (NeedsCondInvert) // Invert the condition if needed.
9239            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9240                               DAG.getConstant(1, Cond.getValueType()));
9241
9242          // Zero extend the condition if needed.
9243          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
9244
9245          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9246          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
9247                             DAG.getConstant(ShAmt, MVT::i8));
9248        }
9249
9250        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
9251        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9252          if (NeedsCondInvert) // Invert the condition if needed.
9253            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9254                               DAG.getConstant(1, Cond.getValueType()));
9255
9256          // Zero extend the condition if needed.
9257          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9258                             FalseC->getValueType(0), Cond);
9259          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9260                             SDValue(FalseC, 0));
9261        }
9262
9263        // Optimize cases that will turn into an LEA instruction.  This requires
9264        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
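        // For example (illustrative): "select C, 37, 32" has Diff = 5, so it
        // can be computed as zext(C)*5 + 32, which folds into a single LEA
        // such as "leal 32(%reg,%reg,4)".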
9265        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9266          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9267          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9268
9269          bool isFastMultiplier = false;
9270          if (Diff < 10) {
9271            switch ((unsigned char)Diff) {
9272              default: break;
9273              case 1:  // result = add base, cond
9274              case 2:  // result = lea base(    , cond*2)
9275              case 3:  // result = lea base(cond, cond*2)
9276              case 4:  // result = lea base(    , cond*4)
9277              case 5:  // result = lea base(cond, cond*4)
9278              case 8:  // result = lea base(    , cond*8)
9279              case 9:  // result = lea base(cond, cond*8)
9280                isFastMultiplier = true;
9281                break;
9282            }
9283          }
9284
9285          if (isFastMultiplier) {
9286            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9287            if (NeedsCondInvert) // Invert the condition if needed.
9288              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9289                                 DAG.getConstant(1, Cond.getValueType()));
9290
9291            // Zero extend the condition if needed.
9292            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9293                               Cond);
9294            // Scale the condition by the difference.
9295            if (Diff != 1)
9296              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9297                                 DAG.getConstant(Diff, Cond.getValueType()));
9298
9299            // Add the base if non-zero.
9300            if (FalseC->getAPIntValue() != 0)
9301              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9302                                 SDValue(FalseC, 0));
9303            return Cond;
9304          }
9305        }
9306      }
9307  }
9308
9309  return SDValue();
9310}
9311
9312/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
9313static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
9314                                  TargetLowering::DAGCombinerInfo &DCI) {
9315  DebugLoc DL = N->getDebugLoc();
9316
9317  // If the flag operand isn't dead, don't touch this CMOV.
9318  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
9319    return SDValue();
9320
9321  // If this is a select between two integer constants, try to do some
9322  // optimizations.  Note that the operands are ordered the opposite of SELECT
9323  // operands.
9324  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
9325    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
9326      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
9327      // larger than FalseC (the false value).
9328      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
9329
9330      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
9331        CC = X86::GetOppositeBranchCondition(CC);
9332        std::swap(TrueC, FalseC);
9333      }
9334
9335      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
9336      // This is efficient for any integer data type (including i8/i16) and
9337      // shift amount.
9338      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
9339        SDValue Cond = N->getOperand(3);
9340        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9341                           DAG.getConstant(CC, MVT::i8), Cond);
9342
9343        // Zero extend the condition if needed.
9344        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
9345
9346        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9347        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
9348                           DAG.getConstant(ShAmt, MVT::i8));
9349        if (N->getNumValues() == 2)  // Dead flag value?
9350          return DCI.CombineTo(N, Cond, SDValue());
9351        return Cond;
9352      }
9353
9354      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
9355      // for any integer data type, including i8/i16.
9356      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9357        SDValue Cond = N->getOperand(3);
9358        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9359                           DAG.getConstant(CC, MVT::i8), Cond);
9360
9361        // Zero extend the condition if needed.
9362        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9363                           FalseC->getValueType(0), Cond);
9364        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9365                           SDValue(FalseC, 0));
9366
9367        if (N->getNumValues() == 2)  // Dead flag value?
9368          return DCI.CombineTo(N, Cond, SDValue());
9369        return Cond;
9370      }
9371
9372      // Optimize cases that will turn into an LEA instruction.  This requires
9373      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
9374      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9375        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9376        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9377
9378        bool isFastMultiplier = false;
9379        if (Diff < 10) {
9380          switch ((unsigned char)Diff) {
9381          default: break;
9382          case 1:  // result = add base, cond
9383          case 2:  // result = lea base(    , cond*2)
9384          case 3:  // result = lea base(cond, cond*2)
9385          case 4:  // result = lea base(    , cond*4)
9386          case 5:  // result = lea base(cond, cond*4)
9387          case 8:  // result = lea base(    , cond*8)
9388          case 9:  // result = lea base(cond, cond*8)
9389            isFastMultiplier = true;
9390            break;
9391          }
9392        }
9393
9394        if (isFastMultiplier) {
9395          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9396          SDValue Cond = N->getOperand(3);
9397          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9398                             DAG.getConstant(CC, MVT::i8), Cond);
9399          // Zero extend the condition if needed.
9400          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9401                             Cond);
9402          // Scale the condition by the difference.
9403          if (Diff != 1)
9404            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9405                               DAG.getConstant(Diff, Cond.getValueType()));
9406
9407          // Add the base if non-zero.
9408          if (FalseC->getAPIntValue() != 0)
9409            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9410                               SDValue(FalseC, 0));
9411          if (N->getNumValues() == 2)  // Dead flag value?
9412            return DCI.CombineTo(N, Cond, SDValue());
9413          return Cond;
9414        }
9415      }
9416    }
9417  }
9418  return SDValue();
9419}
9420
9421
9422/// PerformMulCombine - Optimize a single multiply with constant into two
9423/// in order to implement it with two cheaper instructions, e.g.
9424/// LEA + SHL, LEA + LEA.
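///
/// For example (illustrative): x*45 = (x*9)*5 can become two LEAs, and
/// x*40 = (x*5)*8 can become an LEA plus a shift-left by 3.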
9425static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
9426                                 TargetLowering::DAGCombinerInfo &DCI) {
9427  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9428    return SDValue();
9429
9430  EVT VT = N->getValueType(0);
9431  if (VT != MVT::i64)
9432    return SDValue();
9433
9434  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9435  if (!C)
9436    return SDValue();
9437  uint64_t MulAmt = C->getZExtValue();
9438  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
9439    return SDValue();
9440
9441  uint64_t MulAmt1 = 0;
9442  uint64_t MulAmt2 = 0;
9443  if ((MulAmt % 9) == 0) {
9444    MulAmt1 = 9;
9445    MulAmt2 = MulAmt / 9;
9446  } else if ((MulAmt % 5) == 0) {
9447    MulAmt1 = 5;
9448    MulAmt2 = MulAmt / 5;
9449  } else if ((MulAmt % 3) == 0) {
9450    MulAmt1 = 3;
9451    MulAmt2 = MulAmt / 3;
9452  }
9453  if (MulAmt2 &&
9454      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
9455    DebugLoc DL = N->getDebugLoc();
9456
9457    if (isPowerOf2_64(MulAmt2) &&
9458        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
9459      // If second multiplier is pow2, issue it first. We want the multiply by
9460      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
9461      // is an add.
9462      std::swap(MulAmt1, MulAmt2);
9463
9464    SDValue NewMul;
9465    if (isPowerOf2_64(MulAmt1))
9466      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
9467                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
9468    else
9469      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
9470                           DAG.getConstant(MulAmt1, VT));
9471
9472    if (isPowerOf2_64(MulAmt2))
9473      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
9474                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
9475    else
9476      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
9477                           DAG.getConstant(MulAmt2, VT));
9478
9479    // Do not add new nodes to DAG combiner worklist.
9480    DCI.CombineTo(N, NewMul, false);
9481  }
9482  return SDValue();
9483}
9484
9485static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
9486  SDValue N0 = N->getOperand(0);
9487  SDValue N1 = N->getOperand(1);
9488  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
9489  EVT VT = N0.getValueType();
9490
9491  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
9492  // since the result of setcc_c is all zeros or all ones.
9493  if (N1C && N0.getOpcode() == ISD::AND &&
9494      N0.getOperand(1).getOpcode() == ISD::Constant) {
9495    SDValue N00 = N0.getOperand(0);
9496    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
9497        ((N00.getOpcode() == ISD::ANY_EXTEND ||
9498          N00.getOpcode() == ISD::ZERO_EXTEND) &&
9499         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
9500      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
9501      APInt ShAmt = N1C->getAPIntValue();
9502      Mask = Mask.shl(ShAmt);
9503      if (Mask != 0)
9504        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
9505                           N00, DAG.getConstant(Mask, VT));
9506    }
9507  }
9508
9509  return SDValue();
9510}
9511
9512/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
9513///                       when possible.
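///
/// For example (illustrative), an SHL of a v4i32 value by the splat vector
/// <5, 5, 5, 5> can be emitted as a single PSLLD by 5 rather than four
/// scalar shifts.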
9514static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
9515                                   const X86Subtarget *Subtarget) {
9516  EVT VT = N->getValueType(0);
9517  if (!VT.isVector() && VT.isInteger() &&
9518      N->getOpcode() == ISD::SHL)
9519    return PerformSHLCombine(N, DAG);
9520
9521  // On X86 with SSE2 support, we can transform this to a vector shift if
9522  // all elements are shifted by the same amount.  We can't do this in legalize
9523  // because a constant vector is typically transformed to a constant pool
9524  // so we have no knowledge of the shift amount.
9525  if (!Subtarget->hasSSE2())
9526    return SDValue();
9527
9528  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
9529    return SDValue();
9530
9531  SDValue ShAmtOp = N->getOperand(1);
9532  EVT EltVT = VT.getVectorElementType();
9533  DebugLoc DL = N->getDebugLoc();
9534  SDValue BaseShAmt = SDValue();
9535  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
9536    unsigned NumElts = VT.getVectorNumElements();
9537    unsigned i = 0;
9538    for (; i != NumElts; ++i) {
9539      SDValue Arg = ShAmtOp.getOperand(i);
9540      if (Arg.getOpcode() == ISD::UNDEF) continue;
9541      BaseShAmt = Arg;
9542      break;
9543    }
9544    for (; i != NumElts; ++i) {
9545      SDValue Arg = ShAmtOp.getOperand(i);
9546      if (Arg.getOpcode() == ISD::UNDEF) continue;
9547      if (Arg != BaseShAmt) {
9548        return SDValue();
9549      }
9550    }
9551  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
9552             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
9553    SDValue InVec = ShAmtOp.getOperand(0);
9554    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
9555      unsigned NumElts = InVec.getValueType().getVectorNumElements();
9556      unsigned i = 0;
9557      for (; i != NumElts; ++i) {
9558        SDValue Arg = InVec.getOperand(i);
9559        if (Arg.getOpcode() == ISD::UNDEF) continue;
9560        BaseShAmt = Arg;
9561        break;
9562      }
9563    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
9564       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
9565         unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
9566         if (C->getZExtValue() == SplatIdx)
9567           BaseShAmt = InVec.getOperand(1);
9568       }
9569    }
9570    if (BaseShAmt.getNode() == 0)
9571      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
9572                              DAG.getIntPtrConstant(0));
9573  } else
9574    return SDValue();
9575
9576  // The shift amount is an i32.
9577  if (EltVT.bitsGT(MVT::i32))
9578    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
9579  else if (EltVT.bitsLT(MVT::i32))
9580    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
9581
9582  // All lanes use the same shift amount, so we can do a vector shift.
9583  SDValue ValOp = N->getOperand(0);
9584  switch (N->getOpcode()) {
9585  default:
9586    llvm_unreachable("Unknown shift opcode!");
9587    break;
9588  case ISD::SHL:
9589    if (VT == MVT::v2i64)
9590      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9591                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
9592                         ValOp, BaseShAmt);
9593    if (VT == MVT::v4i32)
9594      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9595                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
9596                         ValOp, BaseShAmt);
9597    if (VT == MVT::v8i16)
9598      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9599                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
9600                         ValOp, BaseShAmt);
9601    break;
9602  case ISD::SRA:
9603    if (VT == MVT::v4i32)
9604      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9605                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
9606                         ValOp, BaseShAmt);
9607    if (VT == MVT::v8i16)
9608      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9609                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
9610                         ValOp, BaseShAmt);
9611    break;
9612  case ISD::SRL:
9613    if (VT == MVT::v2i64)
9614      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9615                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
9616                         ValOp, BaseShAmt);
9617    if (VT == MVT::v4i32)
9618      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9619                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
9620                         ValOp, BaseShAmt);
9621    if (VT == MVT::v8i16)
9622      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9623                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
9624                         ValOp, BaseShAmt);
9625    break;
9626  }
9627  return SDValue();
9628}
9629
9630static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
9631                                TargetLowering::DAGCombinerInfo &DCI,
9632                                const X86Subtarget *Subtarget) {
9633  if (DCI.isBeforeLegalizeOps())
9634    return SDValue();
9635
9636  EVT VT = N->getValueType(0);
9637  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
9638    return SDValue();
9639
9640  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
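  // (the symmetric form, where the plain shift amount is on the SRL side,
  //  is recognized below via the ISD::SUB check and lowered to SHRD instead)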
9641  SDValue N0 = N->getOperand(0);
9642  SDValue N1 = N->getOperand(1);
9643  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
9644    std::swap(N0, N1);
9645  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
9646    return SDValue();
9647  if (!N0.hasOneUse() || !N1.hasOneUse())
9648    return SDValue();
9649
9650  SDValue ShAmt0 = N0.getOperand(1);
9651  if (ShAmt0.getValueType() != MVT::i8)
9652    return SDValue();
9653  SDValue ShAmt1 = N1.getOperand(1);
9654  if (ShAmt1.getValueType() != MVT::i8)
9655    return SDValue();
9656  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
9657    ShAmt0 = ShAmt0.getOperand(0);
9658  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
9659    ShAmt1 = ShAmt1.getOperand(0);
9660
9661  DebugLoc DL = N->getDebugLoc();
9662  unsigned Opc = X86ISD::SHLD;
9663  SDValue Op0 = N0.getOperand(0);
9664  SDValue Op1 = N1.getOperand(0);
9665  if (ShAmt0.getOpcode() == ISD::SUB) {
9666    Opc = X86ISD::SHRD;
9667    std::swap(Op0, Op1);
9668    std::swap(ShAmt0, ShAmt1);
9669  }
9670
9671  unsigned Bits = VT.getSizeInBits();
9672  if (ShAmt1.getOpcode() == ISD::SUB) {
9673    SDValue Sum = ShAmt1.getOperand(0);
9674    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
9675      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
9676      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
9677        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
9678      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
9679        return DAG.getNode(Opc, DL, VT,
9680                           Op0, Op1,
9681                           DAG.getNode(ISD::TRUNCATE, DL,
9682                                       MVT::i8, ShAmt0));
9683    }
9684  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
9685    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
9686    if (ShAmt0C &&
9687        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
9688      return DAG.getNode(Opc, DL, VT,
9689                         N0.getOperand(0), N1.getOperand(0),
9690                         DAG.getNode(ISD::TRUNCATE, DL,
9691                                       MVT::i8, ShAmt0));
9692  }
9693
9694  return SDValue();
9695}
9696
9697/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
9698static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
9699                                   const X86Subtarget *Subtarget) {
9700  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
9701  // the FP state in cases where an emms may be missing.
9702  // A preferable solution to the general problem is to figure out the right
9703  // places to insert EMMS.  This qualifies as a quick hack.
9704
9705  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
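  // For example (illustrative): on a 32-bit target with SSE2, an i64 value
  // that is merely loaded and then stored can be moved as a single f64
  // load/store pair instead of two i32 load/store pairs.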
9706  StoreSDNode *St = cast<StoreSDNode>(N);
9707  EVT VT = St->getValue().getValueType();
9708  if (VT.getSizeInBits() != 64)
9709    return SDValue();
9710
9711  const Function *F = DAG.getMachineFunction().getFunction();
9712  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
9713  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
9714    && Subtarget->hasSSE2();
9715  if ((VT.isVector() ||
9716       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
9717      isa<LoadSDNode>(St->getValue()) &&
9718      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
9719      St->getChain().hasOneUse() && !St->isVolatile()) {
9720    SDNode* LdVal = St->getValue().getNode();
9721    LoadSDNode *Ld = 0;
9722    int TokenFactorIndex = -1;
9723    SmallVector<SDValue, 8> Ops;
9724    SDNode* ChainVal = St->getChain().getNode();
9725    // Must be a store of a load.  We currently handle two cases:  the load
9726    // is a direct child, and it's under an intervening TokenFactor.  It is
9727    // possible to dig deeper under nested TokenFactors.
9728    if (ChainVal == LdVal)
9729      Ld = cast<LoadSDNode>(St->getChain());
9730    else if (St->getValue().hasOneUse() &&
9731             ChainVal->getOpcode() == ISD::TokenFactor) {
9732      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
9733        if (ChainVal->getOperand(i).getNode() == LdVal) {
9734          TokenFactorIndex = i;
9735          Ld = cast<LoadSDNode>(St->getValue());
9736        } else
9737          Ops.push_back(ChainVal->getOperand(i));
9738      }
9739    }
9740
9741    if (!Ld || !ISD::isNormalLoad(Ld))
9742      return SDValue();
9743
9744    // If this is not the MMX case, i.e. we are just turning i64 load/store
9745    // into f64 load/store, avoid the transformation if there are multiple
9746    // uses of the loaded value.
9747    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
9748      return SDValue();
9749
9750    DebugLoc LdDL = Ld->getDebugLoc();
9751    DebugLoc StDL = N->getDebugLoc();
9752    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
9753    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
9754    // pair instead.
9755    if (Subtarget->is64Bit() || F64IsLegal) {
9756      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
9757      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
9758                                  Ld->getBasePtr(), Ld->getSrcValue(),
9759                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
9760                                  Ld->isNonTemporal(), Ld->getAlignment());
9761      SDValue NewChain = NewLd.getValue(1);
9762      if (TokenFactorIndex != -1) {
9763        Ops.push_back(NewChain);
9764        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
9765                               Ops.size());
9766      }
9767      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
9768                          St->getSrcValue(), St->getSrcValueOffset(),
9769                          St->isVolatile(), St->isNonTemporal(),
9770                          St->getAlignment());
9771    }
9772
9773    // Otherwise, lower to two pairs of 32-bit loads / stores.
9774    SDValue LoAddr = Ld->getBasePtr();
9775    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
9776                                 DAG.getConstant(4, MVT::i32));
9777
9778    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
9779                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
9780                               Ld->isVolatile(), Ld->isNonTemporal(),
9781                               Ld->getAlignment());
9782    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
9783                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
9784                               Ld->isVolatile(), Ld->isNonTemporal(),
9785                               MinAlign(Ld->getAlignment(), 4));
9786
9787    SDValue NewChain = LoLd.getValue(1);
9788    if (TokenFactorIndex != -1) {
9789      Ops.push_back(LoLd);
9790      Ops.push_back(HiLd);
9791      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
9792                             Ops.size());
9793    }
9794
9795    LoAddr = St->getBasePtr();
9796    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
9797                         DAG.getConstant(4, MVT::i32));
9798
9799    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
9800                                St->getSrcValue(), St->getSrcValueOffset(),
9801                                St->isVolatile(), St->isNonTemporal(),
9802                                St->getAlignment());
9803    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
9804                                St->getSrcValue(),
9805                                St->getSrcValueOffset() + 4,
9806                                St->isVolatile(),
9807                                St->isNonTemporal(),
9808                                MinAlign(St->getAlignment(), 4));
9809    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
9810  }
9811  return SDValue();
9812}
9813
9814/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
9815/// X86ISD::FXOR nodes.
9816static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
9817  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
9818  // F[X]OR(0.0, x) -> x
9819  // F[X]OR(x, 0.0) -> x
9820  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
9821    if (C->getValueAPF().isPosZero())
9822      return N->getOperand(1);
9823  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
9824    if (C->getValueAPF().isPosZero())
9825      return N->getOperand(0);
9826  return SDValue();
9827}
9828
9829/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
9830static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
9831  // FAND(0.0, x) -> 0.0
9832  // FAND(x, 0.0) -> 0.0
9833  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
9834    if (C->getValueAPF().isPosZero())
9835      return N->getOperand(0);
9836  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
9837    if (C->getValueAPF().isPosZero())
9838      return N->getOperand(1);
9839  return SDValue();
9840}
9841
9842static SDValue PerformBTCombine(SDNode *N,
9843                                SelectionDAG &DAG,
9844                                TargetLowering::DAGCombinerInfo &DCI) {
9845  // BT ignores high bits in the bit index operand.
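  // (only the low log2(width) bits of the index are demanded, so the index
  //  computation can often be simplified through SimplifyDemandedBits below)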
9846  SDValue Op1 = N->getOperand(1);
9847  if (Op1.hasOneUse()) {
9848    unsigned BitWidth = Op1.getValueSizeInBits();
9849    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
9850    APInt KnownZero, KnownOne;
9851    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
9852                                          !DCI.isBeforeLegalizeOps());
9853    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9854    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
9855        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
9856      DCI.CommitTargetLoweringOpt(TLO);
9857  }
9858  return SDValue();
9859}
9860
9861static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
9862  SDValue Op = N->getOperand(0);
9863  if (Op.getOpcode() == ISD::BIT_CONVERT)
9864    Op = Op.getOperand(0);
9865  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
9866  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
9867      VT.getVectorElementType().getSizeInBits() ==
9868      OpVT.getVectorElementType().getSizeInBits()) {
9869    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
9870  }
9871  return SDValue();
9872}
9873
9874static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
9875  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
9876  //           (and (i32 x86isd::setcc_carry), 1)
9877  // This eliminates the zext. This transformation is necessary because
9878  // ISD::SETCC is always legalized to i8.
9879  DebugLoc dl = N->getDebugLoc();
9880  SDValue N0 = N->getOperand(0);
9881  EVT VT = N->getValueType(0);
9882  if (N0.getOpcode() == ISD::AND &&
9883      N0.hasOneUse() &&
9884      N0.getOperand(0).hasOneUse()) {
9885    SDValue N00 = N0.getOperand(0);
9886    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
9887      return SDValue();
9888    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9889    if (!C || C->getZExtValue() != 1)
9890      return SDValue();
9891    return DAG.getNode(ISD::AND, dl, VT,
9892                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
9893                                   N00.getOperand(0), N00.getOperand(1)),
9894                       DAG.getConstant(1, VT));
9895  }
9896
9897  return SDValue();
9898}
9899
9900SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
9901                                             DAGCombinerInfo &DCI) const {
9902  SelectionDAG &DAG = DCI.DAG;
9903  switch (N->getOpcode()) {
9904  default: break;
9905  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
9906  case ISD::EXTRACT_VECTOR_ELT:
9907                        return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
9908  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
9909  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
9910  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
9911  case ISD::SHL:
9912  case ISD::SRA:
9913  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
9914  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
9915  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
9916  case X86ISD::FXOR:
9917  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
9918  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
9919  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
9920  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
9921  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
9922  }
9923
9924  return SDValue();
9925}
9926
9927/// isTypeDesirableForOp - Return true if the target has native support for
9928/// the specified value type and it is 'desirable' to use the type for the
9929/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
9930/// instruction encodings are longer and some i16 instructions are slow.
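///
/// For example (illustrative): 16-bit arithmetic such as "addw %bx, %ax"
/// needs a 0x66 operand-size prefix, making it one byte longer than the
/// equivalent "addl %ebx, %eax", and 16-bit immediates can trigger
/// length-changing-prefix decode stalls on some cores.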
9931bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
9932  if (!isTypeLegal(VT))
9933    return false;
9934  if (VT != MVT::i16)
9935    return true;
9936
9937  switch (Opc) {
9938  default:
9939    return true;
9940  case ISD::LOAD:
9941  case ISD::SIGN_EXTEND:
9942  case ISD::ZERO_EXTEND:
9943  case ISD::ANY_EXTEND:
9944  case ISD::SHL:
9945  case ISD::SRL:
9946  case ISD::SUB:
9947  case ISD::ADD:
9948  case ISD::MUL:
9949  case ISD::AND:
9950  case ISD::OR:
9951  case ISD::XOR:
9952    return false;
9953  }
9954}
9955
9956static bool MayFoldLoad(SDValue Op) {
9957  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
9958}
9959
9960static bool MayFoldIntoStore(SDValue Op) {
9961  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
9962}
9963
9964/// IsDesirableToPromoteOp - This method queries the target whether it is
9965/// beneficial for dag combiner to promote the specified node. If true, it
9966/// should return the desired promotion type by reference.
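///
/// On x86 this returns true for most i16 operations (promoting them to i32)
/// unless doing so would defeat load/store folding; see the MayFoldLoad and
/// MayFoldIntoStore checks below.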
9967bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
9968  EVT VT = Op.getValueType();
9969  if (VT != MVT::i16)
9970    return false;
9971
9972  bool Promote = false;
9973  bool Commute = false;
9974  switch (Op.getOpcode()) {
9975  default: break;
9976  case ISD::LOAD: {
9977    LoadSDNode *LD = cast<LoadSDNode>(Op);
9978    // If the non-extending load has a single use and it's not live out, then it
9979    // might be folded.
9980    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
9981                                                     Op.hasOneUse()*/) {
9982      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9983             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
9984        // The only case where we'd want to promote LOAD (rather than it being
9985        // promoted as an operand) is when its only use is a liveout.
9986        if (UI->getOpcode() != ISD::CopyToReg)
9987          return false;
9988      }
9989    }
9990    Promote = true;
9991    break;
9992  }
9993  case ISD::SIGN_EXTEND:
9994  case ISD::ZERO_EXTEND:
9995  case ISD::ANY_EXTEND:
9996    Promote = true;
9997    break;
9998  case ISD::SHL:
9999  case ISD::SRL: {
10000    SDValue N0 = Op.getOperand(0);
10001    // Look out for (store (shl (load), x)).
10002    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
10003      return false;
10004    Promote = true;
10005    break;
10006  }
10007  case ISD::ADD:
10008  case ISD::MUL:
10009  case ISD::AND:
10010  case ISD::OR:
10011  case ISD::XOR:
10012    Commute = true;
10013    // fallthrough
10014  case ISD::SUB: {
10015    SDValue N0 = Op.getOperand(0);
10016    SDValue N1 = Op.getOperand(1);
10017    if (!Commute && MayFoldLoad(N1))
10018      return false;
10019    // Avoid disabling potential load folding opportunities.
10020    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
10021      return false;
10022    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
10023      return false;
10024    Promote = true;
10025  }
10026  }
10027
10028  PVT = MVT::i32;
10029  return Promote;
10030}
10031
10032//===----------------------------------------------------------------------===//
10033//                           X86 Inline Assembly Support
10034//===----------------------------------------------------------------------===//
10035
10036static bool LowerToBSwap(CallInst *CI) {
10037  // FIXME: this should verify that we are targeting a 486 or better.  If not,
10038  // we will turn this bswap into something that will be lowered to logical ops
10039  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
10040  // so don't worry about this.
10041
10042  // Verify this is a simple bswap.
10043  if (CI->getNumArgOperands() != 1 ||
10044      CI->getType() != CI->getArgOperand(0)->getType() ||
10045      !CI->getType()->isIntegerTy())
10046    return false;
10047
10048  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
10049  if (!Ty || Ty->getBitWidth() % 16 != 0)
10050    return false;
10051
10052  // Okay, we can do this xform, do so now.
10053  const Type *Tys[] = { Ty };
10054  Module *M = CI->getParent()->getParent()->getParent();
10055  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
10056
10057  Value *Op = CI->getArgOperand(0);
10058  Op = CallInst::Create(Int, Op, CI->getName(), CI);
10059
10060  CI->replaceAllUsesWith(Op);
10061  CI->eraseFromParent();
10062  return true;
10063}
10064
10065bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
10066  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
10067  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
10068
10069  std::string AsmStr = IA->getAsmString();
10070
10071  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
10072  SmallVector<StringRef, 4> AsmPieces;
10073  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?
10074
10075  switch (AsmPieces.size()) {
10076  default: return false;
10077  case 1:
10078    AsmStr = AsmPieces[0];
10079    AsmPieces.clear();
10080    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
10081
10082    // bswap $0
10083    if (AsmPieces.size() == 2 &&
10084        (AsmPieces[0] == "bswap" ||
10085         AsmPieces[0] == "bswapq" ||
10086         AsmPieces[0] == "bswapl") &&
10087        (AsmPieces[1] == "$0" ||
10088         AsmPieces[1] == "${0:q}")) {
10089      // No need to check constraints, nothing other than the equivalent of
10090      // "=r,0" would be valid here.
10091      return LowerToBSwap(CI);
10092    }
10093    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
10094    if (CI->getType()->isIntegerTy(16) &&
10095        AsmPieces.size() == 3 &&
10096        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
10097        AsmPieces[1] == "$$8," &&
10098        AsmPieces[2] == "${0:w}" &&
10099        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
10100      AsmPieces.clear();
10101      const std::string &Constraints = IA->getConstraintString();
10102      SplitString(StringRef(Constraints).substr(5), AsmPieces, ",");
10103      std::sort(AsmPieces.begin(), AsmPieces.end());
10104      if (AsmPieces.size() == 4 &&
10105          AsmPieces[0] == "~{cc}" &&
10106          AsmPieces[1] == "~{dirflag}" &&
10107          AsmPieces[2] == "~{flags}" &&
10108          AsmPieces[3] == "~{fpsr}") {
10109        return LowerToBSwap(CI);
10110      }
10111    }
10112    break;
10113  case 3:
10114    if (CI->getType()->isIntegerTy(64) &&
10115        Constraints.size() >= 2 &&
10116        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
10117        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
10118      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
10119      SmallVector<StringRef, 4> Words;
10120      SplitString(AsmPieces[0], Words, " \t");
10121      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
10122        Words.clear();
10123        SplitString(AsmPieces[1], Words, " \t");
10124        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
10125          Words.clear();
10126          SplitString(AsmPieces[2], Words, " \t,");
10127          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
10128              Words[2] == "%edx") {
10129            return LowerToBSwap(CI);
10130          }
10131        }
10132      }
10133    }
10134    break;
10135  }
10136  return false;
10137}
10138
10139
10140
10141/// getConstraintType - Given a constraint letter, return the type of
10142/// constraint it is for this target.
10143X86TargetLowering::ConstraintType
10144X86TargetLowering::getConstraintType(const std::string &Constraint) const {
10145  if (Constraint.size() == 1) {
10146    switch (Constraint[0]) {
10147    case 'A':
10148      return C_Register;
10149    case 'f':
10150    case 'r':
10151    case 'R':
10152    case 'l':
10153    case 'q':
10154    case 'Q':
10155    case 'x':
10156    case 'y':
10157    case 'Y':
10158      return C_RegisterClass;
10159    case 'e':
10160    case 'Z':
10161      return C_Other;
10162    default:
10163      break;
10164    }
10165  }
10166  return TargetLowering::getConstraintType(Constraint);
10167}
10168
10169/// LowerXConstraint - try to replace an X constraint, which matches anything,
10170/// with another that has more specific requirements based on the type of the
10171/// corresponding operand.
10172const char *X86TargetLowering::
10173LowerXConstraint(EVT ConstraintVT) const {
10174  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
10175  // 'f' like normal targets.
10176  if (ConstraintVT.isFloatingPoint()) {
10177    if (Subtarget->hasSSE2())
10178      return "Y";
10179    if (Subtarget->hasSSE1())
10180      return "x";
10181  }
10182
10183  return TargetLowering::LowerXConstraint(ConstraintVT);
10184}
10185
10186/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
10187/// vector.  If it is invalid, don't add anything to Ops.
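///
/// The immediate constraints handled here are (per the code below):
/// 'I' (0..31), 'J' (0..63), 'K' (signed 8-bit), 'N' (unsigned 8-bit),
/// 'e' (signed 32-bit), 'Z' (unsigned 32-bit), and 'i' (literal immediates
/// and, outside PIC, global addresses with an optional offset).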
10188void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
10189                                                     char Constraint,
10190                                                     std::vector<SDValue>&Ops,
10191                                                     SelectionDAG &DAG) const {
10192  SDValue Result(0, 0);
10193
10194  switch (Constraint) {
10195  default: break;
10196  case 'I':
10197    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10198      if (C->getZExtValue() <= 31) {
10199        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10200        break;
10201      }
10202    }
10203    return;
10204  case 'J':
10205    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10206      if (C->getZExtValue() <= 63) {
10207        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10208        break;
10209      }
10210    }
10211    return;
10212  case 'K':
10213    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10214      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
10215        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10216        break;
10217      }
10218    }
10219    return;
10220  case 'N':
10221    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10222      if (C->getZExtValue() <= 255) {
10223        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10224        break;
10225      }
10226    }
10227    return;
10228  case 'e': {
10229    // 32-bit signed value
10230    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10231      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
10232                                           C->getSExtValue())) {
10233        // Widen to 64 bits here to get it sign extended.
10234        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
10235        break;
10236      }
10237    // FIXME gcc accepts some relocatable values here too, but only in certain
10238    // memory models; it's complicated.
10239    }
10240    return;
10241  }
10242  case 'Z': {
10243    // 32-bit unsigned value
10244    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10245      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
10246                                           C->getZExtValue())) {
10247        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10248        break;
10249      }
10250    }
10251    // FIXME gcc accepts some relocatable values here too, but only in certain
10252    // memory models; it's complicated.
10253    return;
10254  }
10255  case 'i': {
10256    // Literal immediates are always ok.
10257    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
10258      // Widen to 64 bits here to get it sign extended.
10259      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
10260      break;
10261    }
10262
10263    // In any sort of PIC mode addresses need to be computed at runtime by
10264    // adding in a register or some sort of table lookup.  These can't
10265    // be used as immediates.
10266    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
10267      return;
10268
10269    // If we are in non-pic codegen mode, we allow the address of a global (with
10270    // an optional displacement) to be used with 'i'.
10271    GlobalAddressSDNode *GA = 0;
10272    int64_t Offset = 0;
10273
10274    // Match either (GA), (GA+C), (GA+C1+C2), etc.
10275    while (1) {
10276      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
10277        Offset += GA->getOffset();
10278        break;
10279      } else if (Op.getOpcode() == ISD::ADD) {
10280        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
10281          Offset += C->getZExtValue();
10282          Op = Op.getOperand(0);
10283          continue;
10284        }
10285      } else if (Op.getOpcode() == ISD::SUB) {
10286        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
10287          Offset += -C->getZExtValue();
10288          Op = Op.getOperand(0);
10289          continue;
10290        }
10291      }
10292
10293      // Otherwise, this isn't something we can handle, reject it.
10294      return;
10295    }
10296
10297    const GlobalValue *GV = GA->getGlobal();
10298    // If we require an extra load to get this address, as in PIC mode, we
10299    // can't accept it.
10300    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
10301                                                        getTargetMachine())))
10302      return;
10303
10304    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
10305                                        GA->getValueType(0), Offset);
10306    break;
10307  }
10308  }
10309
10310  if (Result.getNode()) {
10311    Ops.push_back(Result);
10312    return;
10313  }
10314  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
10315}
10316
10317std::vector<unsigned> X86TargetLowering::
10318getRegClassForInlineAsmConstraint(const std::string &Constraint,
10319                                  EVT VT) const {
10320  if (Constraint.size() == 1) {
10321    // FIXME: not handling fp-stack yet!
10322    switch (Constraint[0]) {      // GCC X86 Constraint Letters
10323    default: break;  // Unknown constraint letter
10324    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
10325      if (Subtarget->is64Bit()) {
10326        if (VT == MVT::i32)
10327          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
10328                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
10329                                       X86::R10D,X86::R11D,X86::R12D,
10330                                       X86::R13D,X86::R14D,X86::R15D,
10331                                       X86::EBP, X86::ESP, 0);
10332        else if (VT == MVT::i16)
10333          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
10334                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
10335                                       X86::R10W,X86::R11W,X86::R12W,
10336                                       X86::R13W,X86::R14W,X86::R15W,
10337                                       X86::BP,  X86::SP, 0);
10338        else if (VT == MVT::i8)
10339          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
10340                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
10341                                       X86::R10B,X86::R11B,X86::R12B,
10342                                       X86::R13B,X86::R14B,X86::R15B,
10343                                       X86::BPL, X86::SPL, 0);
10344
10345        else if (VT == MVT::i64)
10346          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
10347                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
10348                                       X86::R10, X86::R11, X86::R12,
10349                                       X86::R13, X86::R14, X86::R15,
10350                                       X86::RBP, X86::RSP, 0);
10351
10352        break;
10353      }
10354      // 32-bit fallthrough
10355    case 'Q':   // Q_REGS
10356      if (VT == MVT::i32)
10357        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
10358      else if (VT == MVT::i16)
10359        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
10360      else if (VT == MVT::i8)
10361        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
10362      else if (VT == MVT::i64)
10363        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
10364      break;
10365    }
10366  }
10367
10368  return std::vector<unsigned>();
10369}
10370
10371std::pair<unsigned, const TargetRegisterClass*>
10372X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
10373                                                EVT VT) const {
10374  // First, see if this is a constraint that directly corresponds to an LLVM
10375  // register class.
10376  if (Constraint.size() == 1) {
10377    // GCC Constraint Letters
10378    switch (Constraint[0]) {
10379    default: break;
10380    case 'r':   // GENERAL_REGS
10381    case 'l':   // INDEX_REGS
10382      if (VT == MVT::i8)
10383        return std::make_pair(0U, X86::GR8RegisterClass);
10384      if (VT == MVT::i16)
10385        return std::make_pair(0U, X86::GR16RegisterClass);
10386      if (VT == MVT::i32 || !Subtarget->is64Bit())
10387        return std::make_pair(0U, X86::GR32RegisterClass);
10388      return std::make_pair(0U, X86::GR64RegisterClass);
10389    case 'R':   // LEGACY_REGS
10390      if (VT == MVT::i8)
10391        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
10392      if (VT == MVT::i16)
10393        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
10394      if (VT == MVT::i32 || !Subtarget->is64Bit())
10395        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
10396      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
10397    case 'f':  // FP Stack registers.
10398      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
10399      // value to the correct fpstack register class.
10400      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
10401        return std::make_pair(0U, X86::RFP32RegisterClass);
10402      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
10403        return std::make_pair(0U, X86::RFP64RegisterClass);
10404      return std::make_pair(0U, X86::RFP80RegisterClass);
10405    case 'y':   // MMX_REGS if MMX allowed.
10406      if (!Subtarget->hasMMX()) break;
10407      return std::make_pair(0U, X86::VR64RegisterClass);
10408    case 'Y':   // SSE_REGS if SSE2 allowed
10409      if (!Subtarget->hasSSE2()) break;
10410      // FALL THROUGH.
10411    case 'x':   // SSE_REGS if SSE1 allowed
10412      if (!Subtarget->hasSSE1()) break;
10413
10414      switch (VT.getSimpleVT().SimpleTy) {
10415      default: break;
10416      // Scalar SSE types.
10417      case MVT::f32:
10418      case MVT::i32:
10419        return std::make_pair(0U, X86::FR32RegisterClass);
10420      case MVT::f64:
10421      case MVT::i64:
10422        return std::make_pair(0U, X86::FR64RegisterClass);
10423      // Vector types.
10424      case MVT::v16i8:
10425      case MVT::v8i16:
10426      case MVT::v4i32:
10427      case MVT::v2i64:
10428      case MVT::v4f32:
10429      case MVT::v2f64:
10430        return std::make_pair(0U, X86::VR128RegisterClass);
10431      }
10432      break;
10433    }
10434  }
10435
10436  // Use the default implementation in TargetLowering to convert the register
10437  // constraint into a member of a register class.
10438  std::pair<unsigned, const TargetRegisterClass*> Res;
10439  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
10440
10441  // Not found as a standard register?
10442  if (Res.second == 0) {
10443    // Map "st(0)" .. "st(7)" onto the x87 registers ST0 .. ST7.
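    // For example (illustrative), the constraint string "{st(3)}" resolves
    // below to register ST0+3, i.e. st(3), in the RFP80 register class.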
10444    if (Constraint.size() == 7 && Constraint[0] == '{' &&
10445        tolower(Constraint[1]) == 's' &&
10446        tolower(Constraint[2]) == 't' &&
10447        Constraint[3] == '(' &&
10448        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
10449        Constraint[5] == ')' &&
10450        Constraint[6] == '}') {
10451
10452      Res.first = X86::ST0+Constraint[4]-'0';
10453      Res.second = X86::RFP80RegisterClass;
10454      return Res;
10455    }
10456
10457    // GCC allows "st(0)" to be called just plain "st".
10458    if (StringRef("{st}").equals_lower(Constraint)) {
10459      Res.first = X86::ST0;
10460      Res.second = X86::RFP80RegisterClass;
10461      return Res;
10462    }
10463
10464    // flags -> EFLAGS
10465    if (StringRef("{flags}").equals_lower(Constraint)) {
10466      Res.first = X86::EFLAGS;
10467      Res.second = X86::CCRRegisterClass;
10468      return Res;
10469    }
10470
10471    // 'A' means EAX + EDX.
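    // For example (illustrative): asm("rdtsc" : "=A"(tsc)) with a 64-bit 'tsc'
    // on a 32-bit target expects its result in the EDX:EAX register pair.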
10472    if (Constraint == "A") {
10473      Res.first = X86::EAX;
10474      Res.second = X86::GR32_ADRegisterClass;
10475      return Res;
10476    }
10477    return Res;
10478  }
10479
10480  // Otherwise, check to see if this is a register class of the wrong value
10481  // type.  For example, we want to map "{ax}" with an i32 value to {eax};
10482  // we don't want it to be split into the pair {ax},{dx}.
10483  if (Res.second->hasType(VT))
10484    return Res;   // Correct type already, nothing to do.
10485
10486  // All of the single-register GCC register classes map their values onto
10487  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
10488  // really want an 8-bit, 32-bit, or 64-bit register, map to the appropriate
10489  // register class and return the appropriate register.
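  // For example (illustrative): the constraint "{ax}" with an i32 operand is
  // remapped below from AX/GR16 to EAX/GR32, and with an i64 operand to
  // RAX/GR64.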
10490  if (Res.second == X86::GR16RegisterClass) {
10491    if (VT == MVT::i8) {
10492      unsigned DestReg = 0;
10493      switch (Res.first) {
10494      default: break;
10495      case X86::AX: DestReg = X86::AL; break;
10496      case X86::DX: DestReg = X86::DL; break;
10497      case X86::CX: DestReg = X86::CL; break;
10498      case X86::BX: DestReg = X86::BL; break;
10499      }
10500      if (DestReg) {
10501        Res.first = DestReg;
10502        Res.second = X86::GR8RegisterClass;
10503      }
10504    } else if (VT == MVT::i32) {
10505      unsigned DestReg = 0;
10506      switch (Res.first) {
10507      default: break;
10508      case X86::AX: DestReg = X86::EAX; break;
10509      case X86::DX: DestReg = X86::EDX; break;
10510      case X86::CX: DestReg = X86::ECX; break;
10511      case X86::BX: DestReg = X86::EBX; break;
10512      case X86::SI: DestReg = X86::ESI; break;
10513      case X86::DI: DestReg = X86::EDI; break;
10514      case X86::BP: DestReg = X86::EBP; break;
10515      case X86::SP: DestReg = X86::ESP; break;
10516      }
10517      if (DestReg) {
10518        Res.first = DestReg;
10519        Res.second = X86::GR32RegisterClass;
10520      }
10521    } else if (VT == MVT::i64) {
10522      unsigned DestReg = 0;
10523      switch (Res.first) {
10524      default: break;
10525      case X86::AX: DestReg = X86::RAX; break;
10526      case X86::DX: DestReg = X86::RDX; break;
10527      case X86::CX: DestReg = X86::RCX; break;
10528      case X86::BX: DestReg = X86::RBX; break;
10529      case X86::SI: DestReg = X86::RSI; break;
10530      case X86::DI: DestReg = X86::RDI; break;
10531      case X86::BP: DestReg = X86::RBP; break;
10532      case X86::SP: DestReg = X86::RSP; break;
10533      }
10534      if (DestReg) {
10535        Res.first = DestReg;
10536        Res.second = X86::GR64RegisterClass;
10537      }
10538    }
10539  } else if (Res.second == X86::FR32RegisterClass ||
10540             Res.second == X86::FR64RegisterClass ||
10541             Res.second == X86::VR128RegisterClass) {
10542    // Handle references to XMM physical registers that got mapped into the
10543    // wrong class.  This can happen with constraints like {xmm0} where the
10544    // target independent register mapper will just pick the first match it can
10545    // find, ignoring the required type.
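    // For example (illustrative): "{xmm0}" with an f32 operand should end up
    // in FR32, with an f64 operand in FR64, and with a v4f32 operand in VR128.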
10546    if (VT == MVT::f32)
10547      Res.second = X86::FR32RegisterClass;
10548    else if (VT == MVT::f64)
10549      Res.second = X86::FR64RegisterClass;
10550    else if (X86::VR128RegisterClass->hasType(VT))
10551      Res.second = X86::VR128RegisterClass;
10552  }
10553
10554  return Res;
10555}
10556